5 files changed, 535 insertions, 171 deletions
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 89257fa218..ee73325c07 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -242,7 +242,7 @@ typedef struct VP9Context {
     // whole-frame cache
     uint8_t *intra_pred_data[3];
     struct VP9Filter *lflvl;
-    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135*144];
 
     // block reconstruction intermediates
     int block_alloc_using_2pass;
@@ -251,6 +251,8 @@ typedef struct VP9Context {
     struct { int x, y; } min_mv, max_mv;
     DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
     DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
+    uint16_t mvscale[3][2];
+    uint8_t mvstep[3][2];
 } VP9Context;
 
 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
@@ -577,6 +579,26 @@ static int decode_frame_header(AVCodecContext *ctx,
                     s->varcompref[1] = 2;
                 }
             }
+
+            for (i = 0; i < 3; i++) {
+                AVFrame *ref = s->refs[s->refidx[i]].f;
+                int refw = ref->width, refh = ref->height;
+
+                if (refw == w && refh == h) {
+                    s->mvscale[i][0] = s->mvscale[i][1] = 0;
+                } else {
+                    if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) {
+                        av_log(ctx, AV_LOG_ERROR,
+                               "Invalid ref frame dimensions %dx%d for frame size %dx%d\n",
+                               refw, refh, w, h);
+                        return AVERROR_INVALIDDATA;
+                    }
+                    s->mvscale[i][0] = (refw << 14) / w;
+                    s->mvscale[i][1] = (refh << 14) / h;
+                    s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14;
+                    s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14;
+                }
+            }
         }
     }
     s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
@@ -2524,12 +2546,118 @@ static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
     }
 }
 
-static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
-                                         uint8_t *dst, ptrdiff_t dst_stride,
-                                         const uint8_t *ref, ptrdiff_t ref_stride,
-                                         ThreadFrame *ref_frame,
-                                         ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
-                                         int bw, int bh, int w, int h)
+static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                            uint8_t *dst, ptrdiff_t dst_stride,
+                                            const uint8_t *ref, ptrdiff_t ref_stride,
+                                            ThreadFrame *ref_frame,
+                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                            int bw, int bh, int w, int h,
+                                            const uint16_t *scale, const uint8_t *step)
+{ // luma MC where the block position and MV are rescaled by scale[] (>> 14 fixed point)
+#define scale_mv(n, dim) (((int64_t)n * scale[dim]) >> 14) // stays defined after this function; #undef'd inside mc_chroma_scaled
+    // BUG libvpx seems to scale the two components separately. This introduces
+    // rounding errors but we have to reproduce them to be exactly compatible
+    // with the output from libvpx... (mx/my below are in 1/16 units: >> 4 is the integer part)
+    int mx = scale_mv(mv->x * 2, 0) + scale_mv(x * 16, 0);
+    int my = scale_mv(mv->y * 2, 1) + scale_mv(y * 16, 1);
+    int refbw_m1, refbh_m1; // offset of the last reference column/row the block reads
+    int th; // NOTE(review): scale[] is 0 for a same-size ref; a compound block with one such ref looks like it reaches here - confirm
+
+    y = my >> 4; // x/y are reused from here on as integer coordinates in the reference
+    x = mx >> 4;
+    ref += y * ref_stride + x;
+    mx &= 15; // keep only the 1/16 fractional phase
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow (>> 6 gives the value awaited below)
+    th = (y + refbh_m1 + 4 + 7) >> 6;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) { // 3 before / 4 after would leave the w x h area
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref - 3 * ref_stride - 3,
+                                 144, ref_stride,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref = s->edge_emu_buffer + 3 * 144 + 3; // same 3-row/3-column lead-in, now inside the stride-144 buffer
+        ref_stride = 144;
+    }
+    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
+}
+
+static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
+                                              uint8_t *dst_u, uint8_t *dst_v,
+                                              ptrdiff_t dst_stride,
+                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              int bw, int bh, int w, int h,
+                                              const uint16_t *scale, const uint8_t *step)
+{ // chroma MC with a rescaled position/MV; one computed position is applied to both the U and V planes
+    // BUG https://code.google.com/p/webm/issues/detail?id=820
+    int mx = scale_mv(mv->x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    int my = scale_mv(mv->y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+#undef scale_mv
+    int refbw_m1, refbh_m1; // offset of the last reference column/row the block reads
+    int th;
+
+    y = my >> 4; // x/y are reused from here on as integer coordinates in the reference
+    x = mx >> 4;
+    ref_u += y * src_stride_u + x;
+    ref_v += y * src_stride_v + x;
+    mx &= 15; // keep only the 1/16 fractional phase
+    my &= 15;
+    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
+    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    // we use +7 because the last 7 pixels of each sbrow can be changed in
+    // the longest loopfilter of the next sbrow (>> 5 here, where the luma variant uses >> 6)
+    th = (y + refbh_m1 + 4 + 7) >> 5;
+    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
+    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 4 >= h - refbh_m1) { // 3 before / 4 after would leave the w x h area
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_u - 3 * src_stride_u - 3,
+                                 144, src_stride_u,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_u = s->edge_emu_buffer + 3 * 144 + 3;
+        smc(dst_u, dst_stride, ref_u, 144, bh, mx, my, step[0], step[1]); // U is filtered before the buffer is overwritten for V
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                 ref_v - 3 * src_stride_v - 3,
+                                 144, src_stride_v,
+                                 refbw_m1 + 8, refbh_m1 + 8,
+                                 x - 3, y - 3, w, h);
+        ref_v = s->edge_emu_buffer + 3 * 144 + 3;
+        smc(dst_v, dst_stride, ref_v, 144, bh, mx, my, step[0], step[1]);
+    } else { // block lies fully inside the plane: filter straight from the reference
+        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
+        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
+    }
+}
+
+#define FN(x) x##_scaled // the template's FN(inter_pred) becomes inter_pred_scaled
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
+    mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                   mv, bw, bh, w, h, s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, bw, bh, w, h, i) \
+    mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                     row, col, mv, bw, bh, w, h, s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#include "vp9_mc_template.c" // expands the shared inter_pred body with the two macros above
+#undef mc_luma_dir // s##mc above pastes the template's "s" and "mc[...]" arguments into s->dsp.smc[...]
+#undef mc_chroma_dir // both macros use b, which must be a local of the including function
+#undef FN
+
+static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                              uint8_t *dst, ptrdiff_t dst_stride,
+                                              const uint8_t *ref, ptrdiff_t ref_stride,
+                                              ThreadFrame *ref_frame,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              int bw, int bh, int w, int h)
 {
     int mx = mv->x, my = mv->y, th;
 
@@ -2556,14 +2684,14 @@ static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
 }
 
-static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
-                                           uint8_t *dst_u, uint8_t *dst_v,
-                                           ptrdiff_t dst_stride,
-                                           const uint8_t *ref_u, ptrdiff_t src_stride_u,
-                                           const uint8_t *ref_v, ptrdiff_t src_stride_v,
-                                           ThreadFrame *ref_frame,
-                                           ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
-                                           int bw, int bh, int w, int h)
+static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
+                                                uint8_t *dst_u, uint8_t *dst_v,
+                                                ptrdiff_t dst_stride,
+                                                const uint8_t *ref_u, ptrdiff_t src_stride_u,
+                                                const uint8_t *ref_v, ptrdiff_t src_stride_v,
+                                                ThreadFrame *ref_frame,
+                                                ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                                int bw, int bh, int w, int h)
 {
     int mx = mv->x, my = mv->y, th;
 
@@ -2601,156 +2729,32 @@ static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
     }
 }
 
+#define FN(x) x // the template's FN(inter_pred) keeps the plain name inter_pred
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
+    mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
+                     mv, bw, bh, w, h)
+#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                      row, col, mv, bw, bh, w, h, i) \
+    mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
+                       row, col, mv, bw, bh, w, h)
+#include "vp9_mc_template.c" // expands the shared inter_pred body; the trailing i argument is unused here
+#undef mc_luma_dir // undefine the names defined above so they do not leak into the rest of the file
+#undef mc_chroma_dir
+#undef FN
+
 static void inter_recon(AVCodecContext *ctx)
 {
-    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
-        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
-        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
-    };
     VP9Context *s = ctx->priv_data;
     VP9Block *b = s->b;
     int row = s->row, col = s->col;
-    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
-    AVFrame *ref1 = tref1->f, *ref2;
-    int w1 = ref1->width, h1 = ref1->height, w2, h2;
-    ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
-
-    if (b->comp) {
-        tref2 = &s->refs[s->refidx[b->ref[1]]];
-        ref2 = tref2->f;
-        w2 = ref2->width;
-        h2 = ref2->height;
-    }
 
-    // y inter pred
-    if (b->bs > BS_8x8) {
-        if (b->bs == BS_8x4) {
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
-                        s->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
-                            s->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
-            }
-        } else if (b->bs == BS_4x8) {
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
-            }
-        } else {
-            av_assert2(b->bs == BS_4x4);
-
-            // FIXME if two horizontally adjacent blocks have the same MV,
-            // do a w8 instead of a w4 call
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        s->dst[0] + 4 * ls_y, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
-            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
-                        s->dst[0] + 4 * ls_y + 4, ls_y,
-                        ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
-
-            if (b->comp) {
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            s->dst[0] + 4 * ls_y, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
-                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
-                            s->dst[0] + 4 * ls_y + 4, ls_y,
-                            ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
-            }
-        }
+    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
+        inter_pred_scaled(ctx);
     } else {
-        int bwl = bwlog_tab[0][b->bs];
-        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
-
-        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
-                    ref1->data[0], ref1->linesize[0], tref1,
-                    row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
-
-        if (b->comp)
-            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
-                        ref2->data[0], ref2->linesize[0], tref2,
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
-    }
-
-    // uv inter pred
-    {
-        int bwl = bwlog_tab[1][b->bs];
-        int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
-        VP56mv mvuv;
-
-        w1 = (w1 + 1) >> 1;
-        h1 = (h1 + 1) >> 1;
-        if (b->comp) {
-            w2 = (w2 + 1) >> 1;
-            h2 = (h2 + 1) >> 1;
-        }
-        if (b->bs > BS_8x8) {
-            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
-            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
-        } else {
-            mvuv = b->mv[0][0];
-        }
-
-        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
-                      s->dst[1], s->dst[2], ls_uv,
-                      ref1->data[1], ref1->linesize[1],
-                      ref1->data[2], ref1->linesize[2], tref1,
-                      row << 2, col << 2, &mvuv, bw, bh, w1, h1);
-
-        if (b->comp) {
-            if (b->bs > BS_8x8) {
-                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
-                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
-            } else {
-                mvuv = b->mv[0][1];
-            }
-            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
-                          s->dst[1], s->dst[2], ls_uv,
-                          ref2->data[1], ref2->linesize[1],
-                          ref2->data[2], ref2->linesize[2], tref2,
-                          row << 2, col << 2, &mvuv, bw, bh, w2, h2);
-        }
+        inter_pred(ctx);
     }
-
     if (!b->skip) {
-        /* mostly copied intra_reconn() */
+        /* mostly copied intra_recon() */
 
         int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
         int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c
new file mode 100644
index 0000000000..c6ae432e26
--- /dev/null
+++ b/libavcodec/vp9_mc_template.c
@@ -0,0 +1,171 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+static void FN(inter_pred)(AVCodecContext *ctx)
+{ // inter prediction of the current block (s->b): luma first, then both chroma planes; name and MC calls come from the includer's FN/mc_*_dir macros
+    static const uint8_t bwlog_tab[2][N_BS_SIZES] = { // indexed [0 = luma, 1 = chroma][b->bs]; result is the first index into mc[]
+        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
+        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
+    };
+    VP9Context *s = ctx->priv_data;
+    VP9Block *b = s->b;
+    int row = s->row, col = s->col;
+    ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
+    AVFrame *ref1 = tref1->f, *ref2;
+    int w1 = ref1->width, h1 = ref1->height, w2, h2;
+    ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
+
+    if (b->comp) { // tref2/ref2/w2/h2 are only assigned, and only read, when b->comp is set
+        tref2 = &s->refs[s->refidx[b->ref[1]]];
+        ref2 = tref2->f;
+        w2 = ref2->width;
+        h2 = ref2->height;
+    }
+
+    // y inter pred (the last mc_luma_dir argument is the index into b->ref[]: 0 = first ref, 1 = second)
+    if (b->bs > BS_8x8) {
+        if (b->bs == BS_8x4) { // two 8x4 halves stacked vertically, using mv[0] and mv[2]
+            mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[3][b->filter][0],
+                        s->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1, 0);
+
+            if (b->comp) {
+                mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[3][b->filter][1],
+                            s->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2, 1);
+            }
+        } else if (b->bs == BS_4x8) { // two 4x8 halves side by side, using mv[0] and mv[1]
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1, 0);
+
+            if (b->comp) {
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2, 1);
+            }
+        } else { // four 4x4 quarters, one MV each (mv[0]..mv[3])
+            av_assert2(b->bs == BS_4x4);
+
+            // FIXME if two horizontally adjacent blocks have the same MV,
+            // do a w8 instead of a w4 call
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0],
+                        s->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1, 0);
+            mc_luma_dir(s, mc[4][b->filter][0],
+                        s->dst[0] + 4 * ls_y + 4, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1, 0);
+
+            if (b->comp) {
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1],
+                            s->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2, 1);
+                mc_luma_dir(s, mc[4][b->filter][1],
+                            s->dst[0] + 4 * ls_y + 4, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2, 1);
+            }
+        }
+    } else { // remaining sizes: a single call covers the whole block with mv[0]
+        int bwl = bwlog_tab[0][b->bs];
+        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
+
+        mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y,
+                    ref1->data[0], ref1->linesize[0], tref1,
+                    row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1, 0);
+
+        if (b->comp)
+            mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y,
+                        ref2->data[0], ref2->linesize[0], tref2,
+                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2, 1);
+    }
+
+    // uv inter pred (always one call per reference, covering both chroma planes)
+    {
+        int bwl = bwlog_tab[1][b->bs];
+        int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
+        VP56mv mvuv;
+
+        w1 = (w1 + 1) >> 1; // halve the reference dimensions, rounding up, for the chroma calls
+        h1 = (h1 + 1) >> 1;
+        if (b->comp) {
+            w2 = (w2 + 1) >> 1;
+            h2 = (h2 + 1) >> 1;
+        }
+        if (b->bs > BS_8x8) { // sub-8x8: chroma uses the rounded average of the four luma MVs
+            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
+            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
+        } else {
+            mvuv = b->mv[0][0];
+        }
+
+        mc_chroma_dir(s, mc[bwl][b->filter][0],
+                      s->dst[1], s->dst[2], ls_uv,
+                      ref1->data[1], ref1->linesize[1],
+                      ref1->data[2], ref1->linesize[2], tref1,
+                      row << 2, col << 2, &mvuv, bw, bh, w1, h1, 0);
+
+        if (b->comp) {
+            if (b->bs > BS_8x8) { // same averaging, using the second reference's MVs
+                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
+                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
+            } else {
+                mvuv = b->mv[0][1];
+            }
+            mc_chroma_dir(s, mc[bwl][b->filter][1],
+                          s->dst[1], s->dst[2], ls_uv,
+                          ref2->data[1], ref2->linesize[1],
+                          ref2->data[2], ref2->linesize[2], tref2,
+                          row << 2, col << 2, &mvuv, bw, bh, w2, h2, 1);
+        }
+    }
+}
diff --git a/libavcodec/vp9_parser.c b/libavcodec/vp9_parser.c
index 220290fbf1..ab33c33414 100644
--- a/libavcodec/vp9_parser.c
+++ b/libavcodec/vp9_parser.c
@@ -1,5 +1,8 @@
 /*
- * Copyright (C) 2008 Michael Niedermayer
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
  *
  * This file is part of FFmpeg.
  *
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index 4d4518748a..19b93cf073 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -1707,8 +1707,9 @@ copy_avg_fn(4)
 #undef fpel_fn
 #undef copy_avg_fn
 
-static const int8_t vp9_subpel_filters[3][15][8] = {
+static const int16_t vp9_subpel_filters[3][16][8] = {
     [FILTER_8TAP_REGULAR] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
         {  0,  1,  -5, 126,   8,  -3,  1,  0 },
         { -1,  3, -10, 122,  18,  -6,  2,  0 },
         { -1,  4, -13, 118,  27,  -9,  3, -1 },
@@ -1725,6 +1726,7 @@ static const int8_t vp9_subpel_filters[3][15][8] = {
         {  0,  2,  -6,  18, 122, -10,  3, -1 },
         {  0,  1,  -3,   8, 126,  -5,  1,  0 },
     }, [FILTER_8TAP_SHARP] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
         { -1,  3,  -7, 127,   8,  -3,  1,  0 },
         { -2,  5, -13, 125,  17,  -6,  3, -1 },
         { -3,  7, -17, 121,  27, -10,  5, -2 },
@@ -1741,6 +1743,7 @@ static const int8_t vp9_subpel_filters[3][15][8] = {
         { -1,  3,  -6,  17, 125, -13,  5, -2 },
         {  0,  1,  -3,   8, 127,  -7,  3, -1 },
     }, [FILTER_8TAP_SMOOTH] = {
+        {  0,  0,   0, 128,   0,   0,  0,  0 },
         { -3, -1,  32,  64,  38,   1, -3,  0 },
         { -2, -2,  29,  63,  41,   2, -3,  0 },
         { -2, -2,  26,  63,  43,   4, -4,  0 },
@@ -1772,7 +1775,7 @@ static const int8_t vp9_subpel_filters[3][15][8] = {
 static av_always_inline void do_8tap_1d_c(uint8_t *dst, ptrdiff_t dst_stride,
                                           const uint8_t *src, ptrdiff_t src_stride,
                                           int w, int h, ptrdiff_t ds,
-                                          const int8_t *filter, int avg)
+                                          const int16_t *filter, int avg)
 {
     do {
         int x;
@@ -1792,7 +1795,7 @@ static av_always_inline void do_8tap_1d_c(uint8_t *dst, ptrdiff_t dst_stride,
 #define filter_8tap_1d_fn(opn, opa, dir, ds) \
 static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                                 const uint8_t *src, ptrdiff_t src_stride, \
-                                                int w, int h, const int8_t *filter) \
+                                                int w, int h, const int16_t *filter) \
 { \
     do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
 }
@@ -1806,8 +1809,8 @@ filter_8tap_1d_fn(avg, 1, h, 1)
 
 static av_always_inline void do_8tap_2d_c(uint8_t *dst, ptrdiff_t dst_stride,
                                           const uint8_t *src, ptrdiff_t src_stride,
-                                          int w, int h, const int8_t *filterx,
-                                          const int8_t *filtery, int avg)
+                                          int w, int h, const int16_t *filterx,
+                                          const int16_t *filtery, int avg)
 {
     int tmp_h = h + 7;
     uint8_t tmp[64 * 71], *tmp_ptr = tmp;
@@ -1842,8 +1845,8 @@ static av_always_inline void do_8tap_2d_c(uint8_t *dst, ptrdiff_t dst_stride,
 #define filter_8tap_2d_fn(opn, opa) \
 static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                            const uint8_t *src, ptrdiff_t src_stride, \
-                                           int w, int h, const int8_t *filterx, \
-                                           const int8_t *filtery) \
+                                           int w, int h, const int16_t *filterx, \
+                                           const int16_t *filtery) \
 { \
     do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
 }
@@ -1853,15 +1856,13 @@ filter_8tap_2d_fn(avg, 1)
 
 #undef filter_8tap_2d_fn
 
-#undef FILTER_8TAP
-
 #define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
 static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                               const uint8_t *src, ptrdiff_t src_stride, \
                                               int h, int mx, int my) \
 { \
     avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
-                            vp9_subpel_filters[type_idx][dir_m - 1]); \
+                            vp9_subpel_filters[type_idx][dir_m]); \
 }
 
 #define filter_fn_2d(sz, type, type_idx, avg) \
@@ -1870,8 +1871,8 @@ static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                            int h, int mx, int my) \
 { \
     avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
-                       vp9_subpel_filters[type_idx][mx - 1], \
-                       vp9_subpel_filters[type_idx][my - 1]); \
+                       vp9_subpel_filters[type_idx][mx], \
+                       vp9_subpel_filters[type_idx][my]); \
 }
 
 #define FILTER_BILIN(src, x, mxy, stride) \
@@ -1957,8 +1958,6 @@ bilin_2d_fn(avg, 1)
 
 #undef bilin_2d_fn
 
-#undef FILTER_BILIN
-
 #define bilinf_fn_1d(sz, dir, dir_m, avg) \
 static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                       const uint8_t *src, ptrdiff_t src_stride, \
@@ -2053,12 +2052,190 @@ static av_cold void vp9dsp_mc_init(VP9DSPContext *dsp)
 #undef init_subpel3
 }
 
+static av_always_inline void do_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride,
+                                              const uint8_t *src, ptrdiff_t src_stride,
+                                              int w, int h, int mx, int my, // mx/my: initial 1/16-pel phase
+                                              int dx, int dy, int avg, // dx/dy: phase step per output pixel
+                                              const int16_t (*filters)[8]) // 8-tap kernels, indexed by phase
+{ // Scaled 8-tap MC: horizontal pass from src into tmp, then vertical pass from tmp into dst.
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 8; // source rows needed: last row's integer offset + 8 taps
+    uint8_t tmp[64 * 135], *tmp_ptr = tmp; // intermediate rows, 64-byte pitch, room for 135 rows
+
+    src -= src_stride * 3; // start 3 rows up; the vertical pass begins at tmp row 3
+    do {
+        int x;
+        int imx = mx, ioff = 0; // fractional phase / integer source column
+
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1); // horizontal taps (stride 1)
+            imx += dx;
+            ioff += imx >> 4; // carry whole pixels into the column offset
+            imx &= 0xf; // keep only the 1/16 fraction
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp + 64 * 3; // skip the 3 leading rows filtered above
+    do {
+        int x;
+        const int16_t *filter = filters[my]; // kernel for this row's vertical phase
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1; // rounded mean with dst
+            } else {
+                dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64); // vertical taps (stride 64 = tmp pitch)
+            }
+
+        my += dy;
+        tmp_ptr += (my >> 4) * 64; // advance by the whole rows stepped over
+        my &= 0xf; // keep only the 1/16 fraction
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_filter_8tap_fn(opn, opa) /* opn: name prefix, opa: avg flag passed down */ \
+static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my, int dx, int dy, \
+                                            const int16_t (*filters)[8]) \
+{ \
+    do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+                     opa, filters); \
+}
+
+scaled_filter_8tap_fn(put, 0) // put_scaled_8tap_c: stores the filtered value
+scaled_filter_8tap_fn(avg, 1) // avg_scaled_8tap_c: averages it with dst
+
+#undef scaled_filter_8tap_fn
+
+#undef FILTER_8TAP
+
+#define scaled_filter_fn(sz, type, type_idx, avg) /* fixed-width wrapper; avg is the put/avg name prefix */ \
+static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                           const uint8_t *src, ptrdiff_t src_stride, \
+                                           int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, /* w = sz */ \
+                        vp9_subpel_filters[type_idx]); /* kernel set for this filter type */ \
+}
+
+static av_always_inline void do_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride,
+                                               const uint8_t *src, ptrdiff_t src_stride,
+                                               int w, int h, int mx, int my, // mx/my: initial 1/16-pel phase
+                                               int dx, int dy, int avg) // dx/dy: phase step per output pixel
+{ // Scaled bilinear MC: horizontal pass from src into tmp, then vertical pass from tmp into dst.
+    uint8_t tmp[64 * 129], *tmp_ptr = tmp; // intermediate rows, 64-byte pitch, room for 129 rows
+    int tmp_h = (((h - 1) * dy + my) >> 4) + 2; // source rows needed: last row's integer offset + 2
+
+    do {
+        int x;
+        int imx = mx, ioff = 0; // fractional phase / integer source column
+
+        for (x = 0; x < w; x++) {
+            tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1); // horizontal interpolation (stride 1)
+            imx += dx;
+            ioff += imx >> 4; // carry whole pixels into the column offset
+            imx &= 0xf; // keep only the 1/16 fraction
+        }
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+
+    tmp_ptr = tmp; // no leading rows here, unlike the 8-tap variant
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1; // rounded mean with dst
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64); // vertical interpolation (stride 64 = tmp pitch)
+            }
+
+        my += dy;
+        tmp_ptr += (my >> 4) * 64; // advance by the whole rows stepped over
+        my &= 0xf; // keep only the 1/16 fraction
+        dst += dst_stride;
+    } while (--h);
+}
+
+#define scaled_bilin_fn(opn, opa) /* opn: name prefix, opa: avg flag passed down */ \
+static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                             const uint8_t *src, ptrdiff_t src_stride, \
+                                             int w, int h, int mx, int my, int dx, int dy) \
+{ \
+    do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
+}
+
+scaled_bilin_fn(put, 0) // put_scaled_bilin_c: stores the filtered value
+scaled_bilin_fn(avg, 1) // avg_scaled_bilin_c: averages it with dst
+
+#undef scaled_bilin_fn
+
+#undef FILTER_BILIN
+
+#define scaled_bilinf_fn(sz, avg) /* fixed-width wrapper; avg is the put/avg name prefix */ \
+static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                        const uint8_t *src, ptrdiff_t src_stride, \
+                                        int h, int mx, int my, int dx, int dy) \
+{ \
+    avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); /* w = sz */ \
+}
+
+#define scaled_filter_fns(sz, avg) /* three 8-tap variants plus bilinear for one width */ \
+scaled_filter_fn(sz,        regular, FILTER_8TAP_REGULAR, avg) \
+scaled_filter_fn(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
+scaled_filter_fn(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
+scaled_bilinf_fn(sz,                                      avg)
+
+#define scaled_filter_fn_set(avg) /* all five block widths for one put/avg prefix */ \
+scaled_filter_fns(64, avg) \
+scaled_filter_fns(32, avg) \
+scaled_filter_fns(16, avg) \
+scaled_filter_fns(8,  avg) \
+scaled_filter_fns(4,  avg)
+
+scaled_filter_fn_set(put) // defines put_scaled_<type>_<sz>_c
+scaled_filter_fn_set(avg) // defines avg_scaled_<type>_<sz>_c
+
+#undef scaled_filter_fns
+#undef scaled_filter_fn_set
+#undef scaled_filter_fn
+#undef scaled_bilinf_fn
+
+static av_cold void vp9dsp_scaled_mc_init(VP9DSPContext *dsp)
+{ // Fill dsp->smc[size][filter][put/avg] with the C scaled-MC functions.
+#define init_scaled(idx1, idx2, sz, type) /* idx1: size slot, idx2: 0 = put, 1 = avg */ \
+    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_SHARP  ][idx2] = type##_scaled_sharp_##sz##_c; \
+    dsp->smc[idx1][FILTER_BILINEAR    ][idx2] = type##_scaled_bilin_##sz##_c
+
+#define init_scaled_put_avg(idx, sz) /* registers both the put and the avg variant */ \
+    init_scaled(idx, 0, sz, put); \
+    init_scaled(idx, 1, sz, avg)
+
+    init_scaled_put_avg(0, 64); // size slot 0: block width 64
+    init_scaled_put_avg(1, 32); // size slot 1: block width 32
+    init_scaled_put_avg(2, 16); // size slot 2: block width 16
+    init_scaled_put_avg(3,  8); // size slot 3: block width 8
+    init_scaled_put_avg(4,  4); // size slot 4: block width 4
+
+#undef init_scaled_put_avg
+#undef init_scaled
+}
+
 av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
 {
     vp9dsp_intrapred_init(dsp);
     vp9dsp_itxfm_init(dsp);
     vp9dsp_loopfilter_init(dsp);
     vp9dsp_mc_init(dsp);
+    vp9dsp_scaled_mc_init(dsp); // C scaled-MC table (dsp->smc); runs before the x86 init below
 
     if (ARCH_X86) ff_vp9dsp_init_x86(dsp);
 }
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h
index db0a92e210..33dfc09acd 100644
--- a/libavcodec/vp9dsp.h
+++ b/libavcodec/vp9dsp.h
@@ -32,6 +32,9 @@
 typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *ref, ptrdiff_t ref_stride,
                             int h, int mx, int my);
+typedef void (*vp9_scaled_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+                                   const uint8_t *ref, ptrdiff_t ref_stride,
+                                   int h, int mx, int my, int dx, int dy); // C impl: mx/my start phase, dx/dy step, 1/16 pel
 
 typedef struct VP9DSPContext {
     /*
@@ -109,6 +112,12 @@ typedef struct VP9DSPContext {
      * dst/stride are aligned by hsize
      */
     vp9_mc_func mc[5][4][2][2][2];
+
+    /*
+     * for scaled MC, the first 3 dimensions are identical to mc[] above; the
+     * other two don't exist since the subpel position changes at every step.
+     */
+    vp9_scaled_mc_func smc[5][4][2];
 } VP9DSPContext;
 
 void ff_vp9dsp_init(VP9DSPContext *dsp);