56 files changed, 6278 insertions, 1027 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 5c1d118383..8bc8bc528c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -6,18 +6,21 @@ OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
 OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
-OBJS-$(CONFIG_IMDCT15)                  += aarch64/imdct15_init.o
-OBJS-$(CONFIG_MDCT)                     += aarch64/mdct_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
+OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o
 
 # decoders/encoders
-OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
+OBJS-$(CONFIG_AAC_DECODER)              += aarch64/aacpsdsp_init_aarch64.o \
+                                           aarch64/sbrdsp_init_aarch64.o
+OBJS-$(CONFIG_DCA_DECODER)              += aarch64/synth_filter_init.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
-OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
+OBJS-$(CONFIG_VC1DSP)                   += aarch64/vc1dsp_init_aarch64.o
 OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
-OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_aarch64.o
+OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_10bpp_aarch64.o \
+                                           aarch64/vp9dsp_init_12bpp_aarch64.o \
+                                           aarch64/vp9dsp_init_aarch64.o
 
 # ARMv8 optimizations
 
@@ -27,6 +30,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o
 # NEON optimizations
 
 # subsystems
+NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/sbrdsp_neon.o
 NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
 NEON-OBJS-$(CONFIG_FMTCONVERT)          += aarch64/fmtconvert_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
@@ -36,14 +40,19 @@ NEON-OBJS-$(CONFIG_H264PRED)            += aarch64/h264pred_neon.o
 NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
                                            aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
-NEON-OBJS-$(CONFIG_IMDCT15)             += aarch64/imdct15_neon.o
+NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_init_aarch64.o      \
+                                           aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
+NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
 
 # decoders/encoders
-NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
-                                           aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
-NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_neon.o             \
+NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
+                                           aarch64/vp9itxfm_neon.o             \
+                                           aarch64/vp9lpf_16bpp_neon.o         \
                                            aarch64/vp9lpf_neon.o               \
+                                           aarch64/vp9mc_16bpp_neon.o          \
                                            aarch64/vp9mc_neon.o
diff --git a/libavcodec/aarch64/aacpsdsp_init_aarch64.c b/libavcodec/aarch64/aacpsdsp_init_aarch64.c
new file mode 100644
index 0000000000..5e7e19bba4
--- /dev/null
+++ b/libavcodec/aarch64/aacpsdsp_init_aarch64.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
+                                float *src1, int n);
+void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
+                                const float (*filter)[8][2],
+                                ptrdiff_t stride, int n);
+void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
+                                   float h[2][4], float h_step[2][4],
+                                   int len);
+void ff_ps_stereo_interpolate_ipdopd_neon(float (*l)[2], float (*r)[2],
+                                          float h[2][4], float h_step[2][4],
+                                          int len);
+
+av_cold void ff_psdsp_init_aarch64(PSDSPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+    /* Leave the context untouched unless the CPU flags report NEON. */
+    if (!have_neon(flags))
+        return;
+    s->add_squares = ff_ps_add_squares_neon;
+    s->mul_pair_single = ff_ps_mul_pair_single_neon;
+    s->hybrid_analysis = ff_ps_hybrid_analysis_neon;
+    s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
+    s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_neon;
+}
diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S
new file mode 100644
index 0000000000..ff4e6e244a
--- /dev/null
+++ b/libavcodec/aarch64/aacpsdsp_neon.S
@@ -0,0 +1,148 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_ps_add_squares_neon, export=1       // AAPCS64: x0 = dst, x1 = src, w2 = n
+1:      ld1         {v0.4S,v1.4S}, [x1], #32    // load 8 floats (4 re/im pairs) from src, advance 32 bytes
+        fmul        v0.4S, v0.4S, v0.4S         // square every component
+        fmul        v1.4S, v1.4S, v1.4S
+        faddp       v2.4S, v0.4S, v1.4S         // pairwise add: re^2 + im^2 for each of the 4 pairs
+        ld1         {v3.4S}, [x0]               // current 4 dst values
+        fadd        v3.4S, v3.4S, v2.4S         // dst[i] += re^2 + im^2
+        st1         {v3.4S}, [x0], #16          // write back, advance dst by 4 floats
+        subs        w2, w2, #4                  // 4 outputs per pass; a count that is not a multiple of 4 is rounded up
+        b.gt        1b                          // the body runs at least once, even for n <= 0
+        ret
+endfunc
+
+function ff_ps_mul_pair_single_neon, export=1   // AAPCS64: x0 = dst, x1 = src0, x2 = src1, w3 = n
+1:      ld1         {v0.4S,v1.4S}, [x1], #32    // 4 (re, im) pairs from src0
+        ld1         {v2.4S},       [x2], #16    // 4 scalars s0..s3 from src1
+        zip1        v3.4S, v2.4S, v2.4S         // v3 = {s0, s0, s1, s1}
+        zip2        v4.4S, v2.4S, v2.4S         // v4 = {s2, s2, s3, s3}
+        fmul        v0.4S, v0.4S, v3.4S         // scale both components of each pair by its scalar
+        fmul        v1.4S, v1.4S, v4.4S
+        st1         {v0.4S,v1.4S}, [x0], #32    // store the 4 scaled pairs to dst
+        subs        w3, w3, #4                  // 4 pairs per pass; a count that is not a multiple of 4 is rounded up
+        b.gt        1b                          // the body runs at least once, even for n <= 0
+        ret
+endfunc
+
+function ff_ps_stereo_interpolate_neon, export=1 // AAPCS64: x0 = l, x1 = r, x2 = h, x3 = h_step, w4 = len
+        ld1         {v0.4S}, [x2]               // first 4 floats of h: h0..h3
+        ld1         {v1.4S}, [x3]               // first 4 floats of h_step: d0..d3
+        zip1        v4.4S, v0.4S, v0.4S         // v4 = {h0, h0, h1, h1}
+        zip2        v5.4S, v0.4S, v0.4S         // v5 = {h2, h2, h3, h3}
+        zip1        v6.4S, v1.4S, v1.4S         // v6 = {d0, d0, d1, d1}
+        zip2        v7.4S, v1.4S, v1.4S         // v7 = {d2, d2, d3, d3}
+1:      ld1         {v2.2S}, [x0]               // one (re, im) sample from l
+        ld1         {v3.2S}, [x1]               // one (re, im) sample from r
+        fadd        v4.4S, v4.4S, v6.4S         // step the coefficients before they are applied
+        fadd        v5.4S, v5.4S, v7.4S
+        mov         v2.D[1], v2.D[0]            // v2 = {l_re, l_im, l_re, l_im}
+        mov         v3.D[1], v3.D[0]            // v3 = {r_re, r_im, r_re, r_im}
+        fmul        v2.4S, v2.4S, v4.4S         // low half: h0 * l, high half: h1 * l
+        fmla        v2.4S, v3.4S, v5.4S         // low half += h2 * r, high half += h3 * r
+        st1         {v2.D}[0], [x0], #8         // new l sample written in place, advance l
+        st1         {v2.D}[1], [x1], #8         // new r sample written in place, advance r
+        subs        w4, w4, #1                  // one sample pair per pass
+        b.gt        1b                          // the body runs at least once, even for len <= 0
+        ret
+endfunc
+
+function ff_ps_stereo_interpolate_ipdopd_neon, export=1 // AAPCS64: x0 = l, x1 = r, x2 = h, x3 = h_step, w4 = len
+        ld1         {v0.4S,v1.4S}, [x2]         // all 8 floats of h: v0 = h[0][0..3], v1 = h[1][0..3]
+        ld1         {v6.4S,v7.4S}, [x3]         // all 8 floats of h_step, same layout
+        fneg        v2.4S, v1.4S                // -h[1][*]
+        fneg        v3.4S, v7.4S                // -h_step[1][*]
+        zip1        v16.4S, v0.4S, v0.4S        // {h00, h00, h01, h01}
+        zip2        v17.4S, v0.4S, v0.4S        // {h02, h02, h03, h03}
+        zip1        v18.4S, v2.4S, v1.4S        // {-h10, h10, -h11, h11}
+        zip2        v19.4S, v2.4S, v1.4S        // {-h12, h12, -h13, h13}
+        zip1        v20.4S, v6.4S, v6.4S        // per-sample steps for v16
+        zip2        v21.4S, v6.4S, v6.4S        // per-sample steps for v17
+        zip1        v22.4S, v3.4S, v7.4S        // per-sample steps for v18 (same sign pattern)
+        zip2        v23.4S, v3.4S, v7.4S        // per-sample steps for v19 (same sign pattern)
+1:      ld1         {v2.2S}, [x0]               // one (re, im) sample from l
+        ld1         {v3.2S}, [x1]               // one (re, im) sample from r
+        fadd        v16.4S, v16.4S, v20.4S      // step the h[0] coefficients before they are applied
+        fadd        v17.4S, v17.4S, v21.4S
+        mov         v2.D[1], v2.D[0]            // v2 = {l_re, l_im, l_re, l_im}
+        mov         v3.D[1], v3.D[0]            // v3 = {r_re, r_im, r_re, r_im}
+        fmul        v4.4S, v2.4S, v16.4S        // h[0] terms applied to l
+        fmla        v4.4S, v3.4S, v17.4S        // h[0] terms applied to r
+        fadd        v18.4S, v18.4S, v22.4S      // step the signed h[1] coefficients
+        fadd        v19.4S, v19.4S, v23.4S
+        ext         v2.16B, v2.16B, v2.16B, #4  // rotate by one lane: {l_im, l_re, l_im, l_re}
+        ext         v3.16B, v3.16B, v3.16B, #4  // rotate by one lane: {r_im, r_re, r_im, r_re}
+        fmla        v4.4S, v2.4S, v18.4S        // cross terms: -h1x * im into re lanes, +h1x * re into im lanes
+        fmla        v4.4S, v3.4S, v19.4S
+        st1         {v4.D}[0], [x0], #8         // new l sample written in place, advance l
+        st1         {v4.D}[1], [x1], #8         // new r sample written in place, advance r
+        subs        w4, w4, #1                  // one sample pair per pass
+        b.gt        1b                          // the body runs at least once, even for len <= 0
+        ret
+endfunc
+
+function ff_ps_hybrid_analysis_neon, export=1   // AAPCS64: x0 = out, x1 = in, x2 = filter, x3 = stride, w4 = n
+        lsl         x3, x3, #3                  // stride * 8: one float[2] element of out is 8 bytes
+        ld2         {v0.4S,v1.4S}, [x1], #32    // deinterleave: v0 = re, v1 = im of in[0..3]
+        ld2         {v2.2S,v3.2S}, [x1], #16    // v2 = re, v3 = im of in[4..5]
+        ld1         {v24.2S},      [x1], #8     // v24 = {re, im} of in[6], kept interleaved
+        ld2         {v4.2S,v5.2S}, [x1], #16    // v4 = re, v5 = im of in[7..8]
+        ld2         {v6.4S,v7.4S}, [x1]         // v6 = re, v7 = im of in[9..12]
+        rev64       v6.4S, v6.4S                // swap lanes inside each 64-bit half ...
+        rev64       v7.4S, v7.4S
+        ext         v6.16B, v6.16B, v6.16B, #8  // ... then swap the halves: v6 = re of in[12], in[11], in[10], in[9]
+        ext         v7.16B, v7.16B, v7.16B, #8  // v7 = im of in[12], in[11], in[10], in[9]
+        rev64       v4.2S, v4.2S                // v4 = {re8, re7}
+        rev64       v5.2S, v5.2S                // v5 = {im8, im7}
+        mov         v2.D[1], v3.D[0]            // v2 = {re4, re5, im4, im5}
+        mov         v4.D[1], v5.D[0]            // v4 = {re8, re7, im8, im7}
+        mov         v5.D[1], v2.D[0]            // v5 = {im8, im7, re4, re5}
+        mov         v3.D[1], v4.D[0]            // v3 = {im4, im5, re8, re7}
+        fadd        v16.4S, v0.4S, v6.4S        // re[j] + re[12-j], j = 0..3
+        fadd        v17.4S, v1.4S, v7.4S        // im[j] + im[12-j]
+        fsub        v18.4S, v1.4S, v7.4S        // im[j] - im[12-j]
+        fsub        v19.4S, v0.4S, v6.4S        // re[j] - re[12-j]
+        fadd        v22.4S, v2.4S, v4.4S        // {re4+re8, re5+re7, im4+im8, im5+im7}
+        fsub        v23.4S, v5.4S, v3.4S        // {im8-im4, im7-im5, re4-re8, re5-re7}
+        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
+        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
+1:      ld2         {v2.4S,v3.4S}, [x2], #32    // per output: v2 = re, v3 = im of filter taps 0..3
+        ld2         {v4.2S,v5.2S}, [x2], #16    // v4 = re, v5 = im of taps 4..5
+        ld1         {v6.2S},       [x2], #8     // v6 = {re, im} of tap 6
+        add         x2, x2, #8                  // skip the 8th pair of this [8][2] row; it is never read
+        mov         v4.D[1], v5.D[0]            // v4 = {fre4, fre5, fim4, fim5}
+        mov         v6.S[1], v6.S[0]            // v6 = {fre6, fre6}: only the real part of tap 6 is used
+        fmul        v6.2S, v6.2S, v24.2S        // fre6 * in[6]
+        fmul        v0.4S, v2.4S, v16.4S        // v0 collects the real-part partial sums
+        fmul        v1.4S, v2.4S, v17.4S        // v1 collects the imaginary-part partial sums
+        fmls        v0.4S, v3.4S, v18.4S        // re -= fim * (im[j] - im[12-j])
+        fmla        v1.4S, v3.4S, v19.4S        // im += fim * (re[j] - re[12-j])
+        fmla        v0.4S, v4.4S, v20.4S        // taps 4..5 contribution to re
+        fmla        v1.4S, v4.4S, v21.4S        // taps 4..5 contribution to im
+        faddp       v0.4S, v0.4S, v1.4S         // two pairwise adds reduce the lanes to
+        faddp       v0.4S, v0.4S, v0.4S         // v0.2S = {sum of v0, sum of v1}
+        fadd        v0.2S, v0.2S, v6.2S         // add the tap-6 term
+        st1         {v0.2S}, [x0], x3           // store one (re, im) output, step out by the byte stride
+        subs        w4, w4, #1                  // one output per pass
+        b.gt        1b                          // the body runs at least once, even for n <= 0
+        ret
+endfunc
diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h
index 60e32ddd1d..e05c5ad2e4 100644
--- a/libavcodec/aarch64/asm-offsets.h
+++ b/libavcodec/aarch64/asm-offsets.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/cabac.h b/libavcodec/aarch64/cabac.h
index e12953e86c..6b9b77eb30 100644
--- a/libavcodec/aarch64/cabac.h
+++ b/libavcodec/aarch64/cabac.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S
deleted file mode 100644
index 4cd3328042..0000000000
--- a/libavcodec/aarch64/dcadsp_neon.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
-        mov             x3,  #32                // decifactor
-        sub             x1,  x1,  #7*4
-        add             x4,  x0,  #2*32*4 - 16  // out2
-        mov             x7,  #-16
-
-        ld1             {v0.4s,v1.4s}, [x1]
-        // reverse [-num_coeffs + 1, 0]
-        ext             v3.16b, v0.16b, v0.16b, #8
-        ext             v2.16b, v1.16b, v1.16b, #8
-        rev64           v3.4s,  v3.4s
-        rev64           v2.4s,  v2.4s
-1:
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        subs            x3,  x3,  #4
-        fmul            v16.4s, v2.4s,  v4.4s
-        fmul            v23.4s, v0.4s,  v4.4s
-        fmul            v17.4s, v2.4s,  v6.4s
-        fmul            v22.4s, v0.4s,  v6.4s
-
-        fmla            v16.4s, v3.4s,  v5.4s
-        fmla            v23.4s, v1.4s,  v5.4s
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        fmla            v17.4s, v3.4s,  v7.4s
-        fmla            v22.4s, v1.4s,  v7.4s
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        fmul            v18.4s, v2.4s,  v4.4s
-        fmul            v21.4s, v0.4s,  v4.4s
-        fmul            v19.4s, v2.4s,  v6.4s
-        fmul            v20.4s, v0.4s,  v6.4s
-
-        fmla            v18.4s, v3.4s,  v5.4s
-        fmla            v21.4s, v1.4s,  v5.4s
-        fmla            v19.4s, v3.4s,  v7.4s
-        fmla            v20.4s, v1.4s,  v7.4s
-
-        faddp           v16.4s, v16.4s, v17.4s
-        faddp           v18.4s, v18.4s, v19.4s
-        faddp           v20.4s, v20.4s, v21.4s
-        faddp           v22.4s, v22.4s, v23.4s
-        faddp           v16.4s, v16.4s, v18.4s
-        faddp           v20.4s, v20.4s, v22.4s
-
-        st1             {v16.4s}, [x0], #16
-        st1             {v20.4s}, [x4], x7
-        b.gt            1b
-
-        ret
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
-        mov             x3,  #64                // decifactor
-        sub             x1,  x1,  #3*4
-        add             x4,  x0,  #2*64*4 - 16  // out2
-        mov             x7,  #-16
-
-        ld1             {v0.4s}, [x1]
-        // reverse [-num_coeffs + 1, 0]
-        ext             v1.16b, v0.16b, v0.16b, #8
-        rev64           v1.4s,  v1.4s
-
-1:
-        ld1             {v4.4s,v5.4s}, [x2], #32
-        ld1             {v6.4s,v7.4s}, [x2], #32
-        subs            x3,  x3,  #4
-        fmul            v16.4s, v1.4s,  v4.4s
-        fmul            v23.4s, v0.4s,  v4.4s
-        fmul            v17.4s, v1.4s,  v5.4s
-        fmul            v22.4s, v0.4s,  v5.4s
-        fmul            v18.4s, v1.4s,  v6.4s
-        fmul            v21.4s, v0.4s,  v6.4s
-        fmul            v19.4s, v1.4s,  v7.4s
-        fmul            v20.4s, v0.4s,  v7.4s
-        faddp           v16.4s, v16.4s, v17.4s
-        faddp           v18.4s, v18.4s, v19.4s
-        faddp           v20.4s, v20.4s, v21.4s
-        faddp           v22.4s, v22.4s, v23.4s
-        faddp           v16.4s, v16.4s, v18.4s
-        faddp           v20.4s, v20.4s, v22.4s
-        st1             {v16.4s}, [x0], #16
-        st1             {v20.4s}, [x4], x7
-        b.gt            1b
-
-        ret
-endfunc
diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c
index 9cc57d331e..db285205ab 100644
--- a/libavcodec/aarch64/fft_init_aarch64.c
+++ b/libavcodec/aarch64/fft_init_aarch64.c
@@ -1,23 +1,25 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config.h"
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/aarch64/cpu.h"
@@ -27,6 +29,10 @@
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 av_cold void ff_fft_init_aarch64(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -34,5 +40,11 @@ av_cold void ff_fft_init_aarch64(FFTContext *s)
     if (have_neon(cpu_flags)) {
         s->fft_permute  = ff_fft_permute_neon;
         s->fft_calc     = ff_fft_calc_neon;
+#if CONFIG_MDCT
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
     }
 }
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index e205e23d88..862039f97d 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -8,20 +8,20 @@
  * This algorithm (though not any of the implementation details) is
  * based on libdjbfft by D. J. Bernstein.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fmtconvert_init.c b/libavcodec/aarch64/fmtconvert_init.c
index 0a55a1b88c..210e74b654 100644
--- a/libavcodec/aarch64/fmtconvert_init.c
+++ b/libavcodec/aarch64/fmtconvert_init.c
@@ -1,20 +1,20 @@
 /*
  * ARM optimized Format Conversion Utils
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/fmtconvert_neon.S b/libavcodec/aarch64/fmtconvert_neon.S
index 3b33c87ade..2161c3a8ae 100644
--- a/libavcodec/aarch64/fmtconvert_neon.S
+++ b/libavcodec/aarch64/fmtconvert_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau  <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c
index a373291344..fa6e0eaf15 100644
--- a/libavcodec/aarch64/h264chroma_init_aarch64.c
+++ b/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised H.264 chroma functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index edc256cbc3..8be7578001 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -442,7 +442,7 @@ endconst
         h264_chroma_mc4 avg, rv40
 #endif
 
-#if CONFIG_VC1_DECODER
+#if CONFIG_VC1DSP
         h264_chroma_mc8 put, vc1
         h264_chroma_mc8 avg, vc1
         h264_chroma_mc4 put, vc1
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index 07bda2ff07..10cf199333 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -44,20 +44,20 @@ void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
 void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride,
                                                    int alpha, int beta);
 
-void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                    int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                   int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, ptrdiff_t stride, int height,
                                   int log2_den, int weight, int offset);
 
-void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                      int height, int log2_den, int weightd,
                                      int weights, int offset);
-void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
-void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
 
@@ -91,6 +91,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
 
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        if (chroma_format_idc <= 1)
         c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
         c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
         c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 448e575b8c..80ac09d2be 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 1c43c1f301..7de44205d3 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 #include "neon.S"
 
 function ff_h264_idct_add_neon, export=1
+.L_ff_h264_idct_add_neon:       // file-local label; movrel sites in this file reference it instead of the exported name
         ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]
         sxtw            x2,     w2
         movi            v30.8H, #0
@@ -77,6 +78,7 @@ function ff_h264_idct_add_neon, export=1
 endfunc
 
 function ff_h264_idct_dc_add_neon, export=1
+.L_ff_h264_idct_dc_add_neon:    // file-local label; movrel sites in this file reference it instead of the exported name
         sxtw            x2,  w2
         mov             w3,       #0
         ld1r            {v2.8H},  [x1]
@@ -106,8 +108,8 @@ function ff_h264_idct_add16_neon, export=1
         mov             w9,  w3         // stride
         movrel          x7,  scan8
         mov             x10, #16
-        movrel          x13, X(ff_h264_idct_dc_add_neon)
-        movrel          x14, X(ff_h264_idct_add_neon)
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
 1:      mov             w2,  w9
         ldrb            w3,  [x7], #1
         ldrsw           x0,  [x5], #4
@@ -133,8 +135,8 @@ function ff_h264_idct_add16intra_neon, export=1
         mov             w9,  w3         // stride
         movrel          x7,  scan8
         mov             x10, #16
-        movrel          x13, X(ff_h264_idct_dc_add_neon)
-        movrel          x14, X(ff_h264_idct_add_neon)
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
 1:      mov             w2,  w9
         ldrb            w3,  [x7], #1
         ldrsw           x0,  [x5], #4
@@ -160,8 +162,8 @@ function ff_h264_idct_add8_neon, export=1
         add             x5,  x1,  #16*4         // block_offset
         add             x9,  x2,  #16*32        // block
         mov             w19, w3                 // stride
-        movrel          x13, X(ff_h264_idct_dc_add_neon)
-        movrel          x14, X(ff_h264_idct_add_neon)
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
         movrel          x7,  scan8, 16
         mov             x10, #0
         mov             x11, #16
@@ -263,6 +265,7 @@ endfunc
 .endm
 
 function ff_h264_idct8_add_neon, export=1
+.L_ff_h264_idct8_add_neon:      // file-local label; the movrel in ff_h264_idct8_add4_neon references it instead of the exported name
         movi            v19.8H,   #0
         sxtw            x2,       w2
         ld1             {v24.8H, v25.8H}, [x1]
@@ -326,6 +329,7 @@ function ff_h264_idct8_add_neon, export=1
 endfunc
 
 function ff_h264_idct8_dc_add_neon, export=1
+.L_ff_h264_idct8_dc_add_neon:   // file-local label; the movrel in ff_h264_idct8_add4_neon references it instead of the exported name
         mov             w3,       #0
         sxtw            x2,       w2
         ld1r            {v31.8H}, [x1]
@@ -375,8 +379,8 @@ function ff_h264_idct8_add4_neon, export=1
         mov             w2,  w3
         movrel          x7,  scan8
         mov             w10, #16
-        movrel          x13, X(ff_h264_idct8_dc_add_neon)
-        movrel          x14, X(ff_h264_idct8_add_neon)
+        movrel          x13, .L_ff_h264_idct8_dc_add_neon
+        movrel          x14, .L_ff_h264_idct8_add_neon
 1:      ldrb            w9,  [x7], #4
         ldrsw           x0,  [x5], #16
         ldrb            w9,  [x4, w9, UXTW]
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index 8f912cbca9..b144376f90 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index a38a27f186..213b40b3e7 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 74088b216c..77f41d9a21 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 731dc0658d..d27cfac494 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_init_aarch64.c b/libavcodec/aarch64/hpeldsp_init_aarch64.c
index 6bc4c09f6c..144ae2bcc4 100644
--- a/libavcodec/aarch64/hpeldsp_init_aarch64.c
+++ b/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -2,20 +2,20 @@
  * ARM NEON optimised DSP functions
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S
index 29782908f8..a491c173bb 100644
--- a/libavcodec/aarch64/hpeldsp_neon.S
+++ b/libavcodec/aarch64/hpeldsp_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/idct.h b/libavcodec/aarch64/idct.h
new file mode 100644
index 0000000000..5c49046148
--- /dev/null
+++ b/libavcodec/aarch64/idct.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_IDCT_H
+#define AVCODEC_AARCH64_IDCT_H
+
+#include <stddef.h> /* ptrdiff_t, used by the put/add prototypes below */
+#include <stdint.h>
+
+/* Prototypes of the NEON simple IDCT routines; their definitions live elsewhere. */
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+
+#endif /* AVCODEC_AARCH64_IDCT_H */
diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c
new file mode 100644
index 0000000000..0406e60830
--- /dev/null
+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -0,0 +1,58 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idct.h"
+
+/**
+ * Hook the NEON simple IDCT into an IDCTDSPContext when it is applicable.
+ *
+ * The idct, idct_put, idct_add and perm_type fields of @p c are overwritten
+ * only if the CPU flags report NEON, avctx->lowres is zero, high_bit_depth is
+ * zero and avctx->idct_algo is FF_IDCT_AUTO, FF_IDCT_SIMPLEAUTO or
+ * FF_IDCT_SIMPLENEON. Otherwise @p c is left untouched.
+ *
+ * @param c              DSP context to update
+ * @param avctx          codec context supplying lowres and idct_algo
+ * @param high_bit_depth non-zero disables the NEON routines
+ */
+av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    /* Honour the runtime CPU flags so the NEON code can be masked out. */
+    if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+            c->idct_put  = ff_simple_idct_put_neon;
+            c->idct_add  = ff_simple_idct_add_neon;
+            c->idct      = ff_simple_idct_neon;
+            c->perm_type = FF_IDCT_PERM_PARTTRANS;
+        }
+    }
+}
diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c
deleted file mode 100644
index 38018f2b4a..0000000000
--- a/libavcodec/aarch64/imdct15_init.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-#include "libavutil/internal.h"
-
-#include "libavcodec/imdct15.h"
-
-#include "asm-offsets.h"
-
-AV_CHECK_OFFSET(IMDCT15Context, exptab,         CELT_EXPTAB);
-AV_CHECK_OFFSET(IMDCT15Context, fft_n,          CELT_FFT_N);
-AV_CHECK_OFFSET(IMDCT15Context, len2,           CELT_LEN2);
-AV_CHECK_OFFSET(IMDCT15Context, len4,           CELT_LEN4);
-AV_CHECK_OFFSET(IMDCT15Context, tmp,            CELT_TMP);
-AV_CHECK_OFFSET(IMDCT15Context, twiddle_exptab, CELT_TWIDDLE);
-
-void ff_celt_imdct_half_neon(IMDCT15Context *s, float *dst, const float *src,
-                             ptrdiff_t stride, float scale);
-
-void ff_imdct15_init_aarch64(IMDCT15Context *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_half = ff_celt_imdct_half_neon;
-    }
-}
diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S
deleted file mode 100644
index d99edf4108..0000000000
--- a/libavcodec/aarch64/imdct15_neon.S
+++ /dev/null
@@ -1,647 +0,0 @@
-/*
- * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-#include "asm-offsets.h"
-
-.macro shuffle a, b, c, d
-const shuffle_\a\b\c\d, align=4
-        .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
-        .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
-        .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
-        .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
-endconst
-.endm
-
-shuffle 0, 2, 1, 3
-shuffle 1, 0, 3, 2
-shuffle 2, 3, 0, 1
-shuffle 3, 1, 2, 0
-
-
-function fft5_neon
-        lsl             x2,  x2,  #3
-        ld1             {v24.2s},         [x1],  x2
-        ld2             {v25.s,v26.s}[0], [x1],  x2
-        ld2             {v25.s,v26.s}[1], [x1],  x2
-        ld2             {v25.s,v26.s}[2], [x1],  x2
-        ld2             {v25.s,v26.s}[3], [x1]
-        dup             v6.4s,  v24.s[0]
-        dup             v7.4s,  v24.s[1]
-
-        faddp           v0.4s,  v25.4s, v26.4s
-        // z[][0], z[][3]
-        fmul            v16.4s, v25.4s, v15.s[0] // rr
-        fmul            v17.4s, v25.4s, v15.s[1] // ri
-        fmul            v18.4s, v26.4s, v15.s[0] // ir
-        fmul            v19.4s, v26.4s, v15.s[1] // ii
-        faddp           v0.4s,  v0.4s,  v0.4s
-        // z[][1], z[][2]
-        fmul            v20.4s, v25.4s, v15.s[2] // rr
-        fmul            v21.4s, v25.4s, v15.s[3] // ri
-        fmul            v22.4s, v26.4s, v15.s[2] // ir
-        fmul            v23.4s, v26.4s, v15.s[3] // ii
-        fadd            v0.2s,  v24.2s, v0.2s   // out[0]
-
-        // z[0123][0], z[0123][3]
-        fsub            v24.4s, v16.4s, v19.4s  //    (c).re =  rr - ii;
-        fadd            v27.4s, v16.4s, v19.4s  //    (d).re =  rr + ii;
-        ld1             {v16.16b},  [x11]
-        ld1             {v19.16b},  [x14]
-        fadd            v28.4s, v17.4s, v18.4s  //    (c).im =  ri + ir;
-        fsub            v31.4s, v18.4s, v17.4s  //    (d).im = -ri + ir;
-        ld1             {v17.16b},  [x12]
-        // z[0123][1], z[0123][2]
-        fsub            v25.4s, v20.4s, v23.4s  //    (c).re =  rr - ii;
-        fadd            v26.4s, v20.4s, v23.4s  //    (d).re =  rr + ii;
-        ld1             {v18.16b},  [x13]
-        fadd            v29.4s, v21.4s, v22.4s  //    (c).im =  ri + ir;
-        fsub            v30.4s, v22.4s, v21.4s  //    (d).im = -ri + ir;
-
-        //real
-        tbl             v20.16b, {v24.16b}, v16.16b
-        tbl             v21.16b, {v25.16b}, v17.16b
-        tbl             v22.16b, {v26.16b}, v18.16b
-        tbl             v23.16b, {v27.16b}, v19.16b
-        //imag
-        tbl             v16.16b, {v28.16b}, v16.16b
-        tbl             v17.16b, {v29.16b}, v17.16b
-        tbl             v18.16b, {v30.16b}, v18.16b
-        tbl             v19.16b, {v31.16b}, v19.16b
-
-        fadd            v6.4s,  v6.4s,  v20.4s
-        fadd            v22.4s, v22.4s, v23.4s
-        fadd            v7.4s,  v7.4s,  v16.4s
-        fadd            v18.4s, v18.4s, v19.4s
-
-        fadd            v21.4s, v21.4s, v22.4s
-        fadd            v17.4s, v17.4s, v18.4s
-        fadd            v6.4s,  v6.4s,  v21.4s
-        fadd            v7.4s,  v7.4s,  v17.4s
-
-        ret
-endfunc
-
-function fft15_neon
-        mov             x8,  x1
-        mov             x9,  x30
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-
-        add             x1,  x8,  x3,  lsl #3   // in + 1 * stride
-        bl              fft5_neon
-        mov             v1.8b,   v0.8b
-        mov             v2.16b,  v6.16b
-        mov             v3.16b,  v7.16b
-
-        add             x1,  x8,  x3,  lsl #4   // in + 2 * stride
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-        bl              fft5_neon
-        zip1            v1.4s,   v1.4s,  v0.4s
-        mov             v4.16b,  v6.16b
-        mov             v5.16b,  v7.16b
-
-        mov             x1,  x8                 // in + 0 * stride
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-        bl              fft5_neon
-
-        faddp           v20.4s, v1.4s,  v1.4s
-
-        ext             v18.16b, v8.16b,  v8.16b,  #4
-        ext             v19.16b, v9.16b,  v9.16b,  #4
-        mov             v16.16b, v6.16b
-        mov             v17.16b, v7.16b
-        fadd            v20.2s, v20.2s, v0.2s
-
-        uzp1            v18.4s, v18.4s, v10.4s  // exp[2,4,6,8].re
-        uzp1            v19.4s, v19.4s, v11.4s  // exp[2,4,6,8].im
-
-        st1             {v20.2s},  [x0], #8     // out[0]
-
-        fmla            v16.4s, v2.4s,  v8.4s
-        fmls            v16.4s, v3.4s,  v9.4s
-
-        fmla            v17.4s, v2.4s,  v9.4s
-        fmla            v17.4s, v3.4s,  v8.4s
-
-        fmla            v16.4s, v4.4s,  v18.4s
-        fmls            v16.4s, v5.4s,  v19.4s
-
-        fmla            v17.4s, v4.4s,  v19.4s
-        fmla            v17.4s, v5.4s,  v18.4s
-
-        zip1            v18.4s, v16.4s, v17.4s
-        zip2            v19.4s, v16.4s, v17.4s
-
-        rev64           v31.4s, v14.4s
-        trn1            v28.2d, v1.2d,  v1.2d
-        trn2            v29.2d, v1.2d,  v1.2d
-        zip1            v30.2d, v14.2d, v31.2d
-        zip2            v31.2d, v14.2d, v31.2d
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[1-4]
-
-        fmul            v16.4s, v28.4s, v30.4s
-        fmul            v17.4s, v29.4s, v30.4s
-        fmls            v16.4s, v29.4s, v31.4s
-        fmla            v17.4s, v28.4s, v31.4s
-        faddp           v16.4s, v16.4s, v16.4s
-        faddp           v17.4s, v17.4s, v17.4s
-        zip1            v18.2s, v16.2s, v17.2s
-        zip2            v19.2s, v16.2s, v17.2s
-
-        fadd            v18.2s, v18.2s, v0.2s
-        fadd            v0.2s,  v19.2s, v0.2s
-
-        ext             v30.16b, v12.16b, v12.16b, #4
-        ext             v31.16b, v13.16b, v13.16b, #4
-        mov             v16.16b, v6.16b
-        mov             v17.16b, v7.16b
-
-        uzp1            v30.4s, v30.4s, v8.4s
-        uzp1            v31.4s, v31.4s, v9.4s
-
-        st1             {v18.2s},  [x0], #8     // out[5]
-
-        fmla            v16.4s, v2.4s,  v10.4s
-        fmls            v16.4s, v3.4s,  v11.4s
-
-        fmla            v17.4s, v2.4s,  v11.4s
-        fmla            v17.4s, v3.4s,  v10.4s
-
-        fmla            v16.4s, v4.4s,  v30.4s
-        fmls            v16.4s, v5.4s,  v31.4s
-
-        fmla            v17.4s, v4.4s,  v31.4s
-        fmla            v17.4s, v5.4s,  v30.4s
-
-        zip1            v18.4s, v16.4s, v17.4s
-        zip2            v19.4s, v16.4s, v17.4s
-
-        ext             v30.16b, v10.16b, v10.16b, #4
-        ext             v31.16b, v11.16b, v11.16b, #4
-
-        fmla            v6.4s,  v2.4s,  v12.4s
-        fmls            v6.4s,  v3.4s,  v13.4s
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[6-9]
-
-        uzp1            v30.4s, v30.4s, v12.4s
-        uzp1            v31.4s, v31.4s, v13.4s
-
-        fmla            v7.4s,  v2.4s,  v13.4s
-        fmla            v7.4s,  v3.4s,  v12.4s
-
-        st1             {v0.2s},  [x0], #8     // out[10]
-
-        fmla            v6.4s,  v4.4s,  v30.4s
-        fmls            v6.4s,  v5.4s,  v31.4s
-
-        fmla            v7.4s,  v4.4s,  v31.4s
-        fmla            v7.4s,  v5.4s,  v30.4s
-
-        zip1            v18.4s, v6.4s,  v7.4s
-        zip2            v19.4s, v6.4s,  v7.4s
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[11-14]
-
-        ret             x9
-endfunc
-
-// x0: out, x1: out+len2, x2: exptab, x3: len2
-function fft15_pass
-        ands            x6,  x3,  #3
-        mov             x4,  x0
-        mov             x5,  x1
-        b.eq            9f
-        ld1             {v0.2s},  [x0], #8
-        ld1             {v1.2s},  [x1], #8
-        sub             x3,  x3,  x6
-        subs            x6,  x6,  #1
-        fadd            v2.2s,  v0.2s,  v1.2s
-        fsub            v3.2s,  v0.2s,  v1.2s
-        add             x2,  x2,  #8
-        st1             {v2.2s},  [x4], #8
-        st1             {v3.2s},  [x5], #8
-        b.eq            9f
-1:
-        subs            x6,  x6,  #1
-        ldp             s4,  s5,  [x2], #8
-        ldp             s2,  s3,  [x1], #8
-        ldp             s0,  s1,  [x0], #8
-
-        fmul            s6,  s2,  s4
-        fmul            s7,  s2,  s5
-        fmls            s6,  s3,  v5.s[0]
-        fmla            s7,  s3,  v4.s[0]
-
-        fsub            s2,  s0,  s6
-        fsub            s3,  s1,  s7
-        fadd            s0,  s0,  s6
-        fadd            s1,  s1,  s7
-
-        stp             s2,  s3,  [x5], #8
-        stp             s0,  s1,  [x4], #8
-        b.gt            1b
-9:
-        ld1             {v4.4s,v5.4s}, [x2],  #32
-        ld2             {v2.4s,v3.4s}, [x1],  #32
-        uzp1            v6.4s,  v4.4s,  v5.4s
-        uzp2            v7.4s,  v4.4s,  v5.4s
-        ld2             {v0.4s,v1.4s}, [x0],  #32
-8:
-        subs            x3,  x3,  #8
-
-        fmul            v4.4s,  v2.4s,  v6.4s
-        fmul            v5.4s,  v2.4s,  v7.4s
-        b.lt            4f
-
-        ld1             {v18.4s,v19.4s}, [x2],  #32
-
-        fmls            v4.4s,  v3.4s,  v7.4s
-        fmla            v5.4s,  v3.4s,  v6.4s
-
-        ld2             {v22.4s,v23.4s}, [x1],  #32
-
-        fsub            v2.4s,  v0.4s,  v4.4s
-        fadd            v0.4s,  v0.4s,  v4.4s
-        fsub            v3.4s,  v1.4s,  v5.4s
-        fadd            v1.4s,  v1.4s,  v5.4s
-
-        uzp1            v16.4s, v18.4s, v19.4s
-        uzp2            v17.4s, v18.4s, v19.4s
-
-        st2             {v2.4s,v3.4s}, [x5],  #32
-        st2             {v0.4s,v1.4s}, [x4],  #32
-        ld2             {v20.4s,v21.4s}, [x0],  #32
-
-        fmul            v18.4s, v22.4s, v16.4s
-        fmul            v19.4s, v22.4s, v17.4s
-        b.eq            0f
-
-        ld1             {v4.4s,v5.4s}, [x2],  #32
-
-        fmls            v18.4s, v23.4s, v17.4s
-        fmla            v19.4s, v23.4s, v16.4s
-
-        ld2             {v2.4s,v3.4s}, [x1],  #32
-
-        fsub            v22.4s, v20.4s, v18.4s
-        fadd            v20.4s, v20.4s, v18.4s
-        fsub            v23.4s, v21.4s, v19.4s
-        fadd            v21.4s, v21.4s, v19.4s
-
-        uzp1            v6.4s,  v4.4s,  v5.4s
-        uzp2            v7.4s,  v4.4s,  v5.4s
-
-        st2             {v22.4s,v23.4s}, [x5],  #32
-        st2             {v20.4s,v21.4s}, [x4],  #32
-        ld2             {v0.4s,v1.4s}, [x0],  #32
-
-        b               8b
-4:
-        fmls            v4.4s,  v3.4s,  v7.4s
-        fmla            v5.4s,  v3.4s,  v6.4s
-
-        fsub            v2.4s,  v0.4s,  v4.4s
-        fadd            v0.4s,  v0.4s,  v4.4s
-        fsub            v3.4s,  v1.4s,  v5.4s
-        fadd            v1.4s,  v1.4s,  v5.4s
-
-        st2             {v2.4s,v3.4s}, [x5],  #32
-        st2             {v0.4s,v1.4s}, [x4],  #32
-
-        ret
-0:
-        fmls            v18.4s, v23.4s, v17.4s
-        fmla            v19.4s, v23.4s, v16.4s
-
-        fsub            v22.4s, v20.4s, v18.4s
-        fadd            v20.4s, v20.4s, v18.4s
-        fsub            v23.4s, v21.4s, v19.4s
-        fadd            v21.4s, v21.4s, v19.4s
-
-        st2             {v22.4s,v23.4s}, [x5],  #32
-        st2             {v20.4s,v21.4s}, [x4],  #32
-
-        ret
-endfunc
-
-function fft30_neon, align=6
-        sub             sp,  sp,  #0x20
-        stp             x20, x21, [sp]
-        stp             x22, x30, [sp, #0x10]
-        mov             x21, x1
-        mov             x22, x2
-        mov             x20, x4
-        mov             x0,  x21
-        mov             x1,  x22
-        lsl             x3,  x20, #1
-        bl              fft15_neon
-
-        add             x0,  x21, #15*8
-        add             x1,  x22, x20,  lsl #3
-        lsl             x3,  x20, #1
-        bl              fft15_neon
-
-        ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
-        add             x0,  x21, #0
-        add             x1,  x21, #15*8
-        mov             x3,  #15
-        ldp             x20, x21, [sp]
-        ldp             x22, x30, [sp, #0x10]
-        add             sp,  sp,  #0x20
-        b               fft15_pass
-endfunc
-
-.macro  def_fft n, n2
-function fft\n\()_neon, align=6
-        sub             sp,  sp,  #0x30
-        stp             x20, x21, [sp]
-        stp             x22, x30, [sp, #0x10]
-        stp             x23, x24, [sp, #0x20]
-        mov             x21, x1
-        mov             x22, x2
-        mov             x23, x3
-        mov             x20, x4
-        sub             x3,  x3,  #1
-        lsl             x4,  x4,  #1
-        bl              fft\n2\()_neon
-
-        add             x1,  x21, #(\n2 * 8)
-        add             x2,  x22, x20, lsl #3
-        sub             x3,  x23, #1
-        lsl             x4,  x20, #1
-        bl              fft\n2\()_neon
-
-        add             x5,  x10, #CELT_EXPTAB
-        mov             x0,  x21
-        ldr             x2,  [x5,  x23, lsl #3] // s->exptab[N]
-        add             x1,  x21, #(\n2 * 8)
-        mov             x3,  #\n2
-        ldp             x20, x21, [sp]
-        ldp             x22, x30, [sp, #0x10]
-        ldp             x23, x24, [sp, #0x20]
-        add             sp,  sp,  #0x30
-        b               fft15_pass
-endfunc
-.endm
-
-        def_fft    60,  30
-        def_fft   120,  60
-        def_fft   240, 120
-        def_fft   480, 240
-        def_fft   960, 480
-
-function fft_b15_calc_neon
-        sub             sp,  sp,  #0x50
-        ldr             x8,  [x0,  #CELT_EXPTAB]    // s->exptab[0]
-        movrel          x6,  fact5
-        movrel          x11, shuffle_0213
-        movrel          x12, shuffle_1032
-        movrel          x13, shuffle_2301
-        movrel          x14, shuffle_3120
-        add             x8,  x8,  #8
-        movrel          x5,  fft_tab_neon
-        stp             x20, x30, [sp]
-        stp             d8,  d9,  [sp, #0x10]
-        stp             d10, d11, [sp, #0x20]
-        stp             d12, d13, [sp, #0x30]
-        stp             d14, d15, [sp, #0x40]
-        ld1             {v15.4s}, [x6]
-        ld1             {v0.4s,v1.4s},   [x8],  #32
-        ld1             {v6.2s},  [x8],  #8
-        ld1             {v2.4s,v3.4s},   [x8],  #32
-        ld1             {v7.2s},  [x8],  #8
-        ld1             {v4.4s,v5.4s},   [x8],  #32
-        uzp1            v8.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].re
-        uzp2            v9.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].im
-        uzp1            v10.4s, v2.4s,  v3.4s   // exp[ 6 -  9].re
-        uzp2            v11.4s, v2.4s,  v3.4s   // exp[ 6 -  9].im
-        uzp1            v12.4s, v4.4s,  v5.4s   // exp[11 - 14].re
-        uzp2            v13.4s, v4.4s,  v5.4s   // exp[11 - 14].im
-        zip1            v14.4s, v6.4s,  v7.4s   // exp[5,10].re/exp[5,10].im
-        add             x5,  x5,  x3,  lsl #3
-        ldr             x5,  [x5]
-        mov             x10, x0
-        blr             x5
-        ldp             x20, x30, [sp]
-        ldp             d8,  d9,  [sp, #0x10]
-        ldp             d10, d11, [sp, #0x20]
-        ldp             d12, d13, [sp, #0x30]
-        ldp             d14, d15, [sp, #0x40]
-        add             sp,  sp,  #0x50
-        ret
-endfunc
-
-const   fft_tab_neon, relocate=1
-        .quad fft15_neon
-        .quad fft30_neon
-        .quad fft60_neon
-        .quad fft120_neon
-        .quad fft240_neon
-        .quad fft480_neon
-        .quad fft960_neon
-endconst
-
-function ff_celt_imdct_half_neon, export=1
-        sub             sp,  sp,  #0x20
-        stp             x21, x30, [sp]
-        str             s0, [sp, #0x10]
-
-        ldp             w5,  w6,  [x0,  #CELT_LEN2] // CELT_LEN4
-        mov             x10, x0
-        mov             x21, x1
-        sub             w5,  w5,  #1
-        lsl             x7,  x3,  #3            //  2 * stride * sizeof(float)
-        sub             x8,  xzr, x3,  lsl #3   // -2 * stride * sizeof(float)
-        mul             x5,  x5,  x3
-        ldp             x9,  x10, [x0,  #CELT_TMP]  // CELT_TWIDDLE
-        ldr             w3,  [x0, #CELT_FFT_N]
-        add             x5,  x2,  x5,  lsl #2
-        mov             x11, x9
-
-        sub             w6,  w6,  #4
-        ld1             {v0.s}[0],  [x5], x8
-        ld1             {v1.s}[0],  [x2], x7
-        ld1             {v4.4s,v5.4s}, [x10], #32
-        ld1             {v0.s}[1],  [x5], x8
-        ld1             {v1.s}[1],  [x2], x7
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[2],  [x5], x8
-        ld1             {v1.s}[2],  [x2], x7
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[3],  [x5], x8
-        ld1             {v1.s}[3],  [x2], x7
-1:
-        subs            w6,  w6,  #4
-
-        ld1             {v20.s}[0], [x5], x8
-        ld1             {v21.s}[0], [x2], x7
-        ld1             {v4.4s,v5.4s}, [x10], #32
-
-        fmul            v6.4s,  v0.4s,  v2.4s
-        fmul            v7.4s,  v0.4s,  v3.4s
-
-        ld1             {v20.s}[1], [x5], x8
-        ld1             {v21.s}[1], [x2], x7
-
-        fmls            v6.4s,  v1.4s,  v3.4s
-        fmla            v7.4s,  v1.4s,  v2.4s
-
-        ld1             {v20.s}[2], [x5], x8
-        ld1             {v21.s}[2], [x2], x7
-
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v20.s}[3], [x5], x8
-        ld1             {v21.s}[3], [x2], x7
-
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-
-        fmul            v6.4s,  v20.4s, v2.4s
-        fmul            v7.4s,  v20.4s, v3.4s
-
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        fmls            v6.4s,  v21.4s, v3.4s
-        fmla            v7.4s,  v21.4s, v2.4s
-
-        b.eq            3f
-
-        subs            w6,  w6,  #4
-        ld1             {v4.4s,v5.4s}, [x10], #32
-        ld1             {v0.s}[0],  [x5], x8
-        ld1             {v1.s}[0],  [x2], x7
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[1],  [x5], x8
-        ld1             {v1.s}[1],  [x2], x7
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[2],  [x5], x8
-        ld1             {v1.s}[2],  [x2], x7
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-        ld1             {v0.s}[3],  [x5], x8
-        ld1             {v1.s}[3],  [x2], x7
-
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        b.gt            1b
-
-        fmul            v6.4s,  v0.4s,  v2.4s
-        fmul            v7.4s,  v0.4s,  v3.4s
-        fmls            v6.4s,  v1.4s,  v3.4s
-        fmla            v7.4s,  v1.4s,  v2.4s
-3:
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        mov             x2,  x11
-        mov             x4,  #1
-
-        bl              fft_b15_calc_neon
-
-        ldr             w5,  [x10, #CELT_LEN4]
-        ldr             x6,  [x10, #CELT_TWIDDLE]
-        ldr             s31, [sp, #0x10]
-
-        add             x1,  x21, x5,  lsl #2
-        add             x3,  x6,  x5,  lsl #2
-        sub             x0,  x1,  #16
-        sub             x2,  x3,  #16
-        mov             x8,  #-16
-        mov             x7,  #16
-        mov             x10, x0
-        mov             x11, x1
-
-        sub             w5,  w5,  #4
-
-        ld1             {v0.4s},  [x0], x8
-        ld1             {v1.4s},  [x1], x7
-        ld1             {v2.4s},  [x2], x8
-        ld1             {v3.4s},  [x3], x7
-
-        uzp1            v4.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].re
-        uzp2            v6.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].im
-
-        uzp1            v5.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].re
-        uzp2            v7.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].im
-
-        fmul            v1.4s,  v6.4s,  v5.4s
-        fmul            v0.4s,  v6.4s,  v7.4s
-2:
-        subs            w5,  w5,  #4
-
-        ld1             {v20.4s}, [x0], x8
-
-        fmla            v1.4s,  v4.4s,  v7.4s
-        fmls            v0.4s,  v4.4s,  v5.4s
-
-        ld1             {v21.4s}, [x1], x7
-
-        ext             v1.16b, v1.16b, v1.16b, #8
-        fmul            v0.4s,  v0.4s,  v31.s[0]
-
-        ld1             {v2.4s},  [x2], x8
-
-        rev64           v1.4s,  v1.4s
-        fmul            v1.4s,  v1.4s,  v31.s[0]
-
-        ld1             {v3.4s},  [x3], x7
-
-        zip1            v5.4s,  v0.4s,  v1.4s
-        zip2            v7.4s,  v0.4s,  v1.4s
-
-        uzp1            v4.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].re
-        uzp2            v6.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].im
-
-        st1             {v5.4s},  [x10], x8
-        st1             {v7.4s},  [x11], x7
-
-        uzp1            v5.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].re
-        uzp2            v7.4s,  v2.4s,  v3.4s   // twidlle_exptab[-i-2, -i-1, +i, i+1].im
-
-        fmul            v1.4s,  v6.4s,  v5.4s
-        fmul            v0.4s,  v6.4s,  v7.4s
-        b.gt            2b
-
-        fmla            v1.4s,  v4.4s,  v7.4s
-        fmls            v0.4s,  v4.4s,  v5.4s
-        ext             v1.16b, v1.16b, v1.16b, #8
-        fmul            v0.4s,  v0.4s,  v31.s[0]
-        rev64           v1.4s,  v1.4s
-        fmul            v1.4s,  v1.4s,  v31.s[0]
-        zip1            v5.4s,  v0.4s,  v1.4s
-        zip2            v7.4s,  v0.4s,  v1.4s
-        st1             {v5.4s},  [x10], x8
-        st1             {v7.4s},  [x11], x7
-
-        ldp             x21, x30, [sp]
-        add             sp,  sp,  #0x20
-        ret
-endfunc
-
-// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
-const   fact5,          align=4
-        .float           0.30901699437494745, 0.95105651629515353
-        .float          -0.80901699437494734, 0.58778525229247325
-endconst
diff --git a/libavcodec/aarch64/mdct_init.c b/libavcodec/aarch64/mdct_init.c
deleted file mode 100644
index 816111ab63..0000000000
--- a/libavcodec/aarch64/mdct_init.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_aarch64(FFTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_calc   = ff_imdct_calc_neon;
-        s->imdct_half   = ff_imdct_half_neon;
-        s->mdct_calc    = ff_mdct_calc_neon;
-        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-    }
-}
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index bccd8323fd..1fd199c972 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -3,20 +3,20 @@
  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_init.c b/libavcodec/aarch64/mpegaudiodsp_init.c
index 849e310f62..5d966af5f4 100644
--- a/libavcodec/aarch64/mpegaudiodsp_init.c
+++ b/libavcodec/aarch64/mpegaudiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S
index 2a36f67603..b6ef131228 100644
--- a/libavcodec/aarch64/mpegaudiodsp_neon.S
+++ b/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 377009e244..0fddbecae3 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/neontest.c b/libavcodec/aarch64/neontest.c
index 201bfb1ce7..a24c22dd30 100644
--- a/libavcodec/aarch64/neontest.c
+++ b/libavcodec/aarch64/neontest.c
@@ -2,20 +2,20 @@
  * check NEON registers for clobbers
  * Copyright (c) 2013 Martin Storsjo
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c
index f7fcd5b493..142705db98 100644
--- a/libavcodec/aarch64/rv40dsp_init_aarch64.c
+++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/sbrdsp_init_aarch64.c b/libavcodec/aarch64/sbrdsp_init_aarch64.c
new file mode 100644
index 0000000000..9c967990df
--- /dev/null
+++ b/libavcodec/aarch64/sbrdsp_init_aarch64.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/sbrdsp.h"
+
+void ff_sbr_sum64x5_neon(float *z); /* every prototype in this list is implemented in sbrdsp_neon.S */
+float ff_sbr_sum_square_neon(float (*x)[2], int n); /* n is consumed two complex samples at a time */
+void ff_sbr_neg_odd_64_neon(float *x);
+void ff_sbr_qmf_pre_shuffle_neon(float *z);
+void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
+void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
+void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
+                           const float *g_filt, int m_max, intptr_t ixh); /* the asm sign-extends only the low 32 bits of ixh */
+void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
+                        const float alpha0[2], const float alpha1[2],
+                        float bw, int start, int end);
+void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
+void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max); /* kx is not read by this variant */
+void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max); /* only bit 0 of kx is used */
+void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max); /* kx is not read by this variant */
+void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max); /* only bit 0 of kx is used */
+
+av_cold void ff_sbrdsp_init_aarch64(SBRDSPContext *s) /* point the SBR DSP hooks in s at the NEON routines */
+{
+    int cpu_flags = av_get_cpu_flags(); /* CPU feature flags as reported by libavutil */
+
+    if (have_neon(cpu_flags)) { /* without NEON, s is left exactly as the caller set it up */
+        s->sum64x5 = ff_sbr_sum64x5_neon;
+        s->sum_square = ff_sbr_sum_square_neon;
+        s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
+        s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
+        s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
+        s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
+        s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
+        s->hf_g_filt = ff_sbr_hf_g_filt_neon;
+        s->hf_gen = ff_sbr_hf_gen_neon;
+        s->autocorrelate = ff_sbr_autocorrelate_neon;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon; /* slot index matches the _N_ suffix of the routine */
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
+    }
+}
diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S
new file mode 100644
index 0000000000..d23717e760
--- /dev/null
+++ b/libavcodec/aarch64/sbrdsp_neon.S
@@ -0,0 +1,327 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const factors, align=4                          // alternating +1/-1 multipliers, loaded by ff_sbr_hf_gen_neon and ff_sbr_autocorrelate_neon
+        .float 1.0, -1.0, 1.0, -1.0
+endconst
+
+const phi_noise_0, align=4                      // s_m multiplier for ff_sbr_hf_apply_noise_0_neon: adds +s_m to the first float of each pair
+        .float 1.0, 0.0, 1.0, 0.0
+endconst
+
+const phi_noise_1, align=4                      // two 16-byte rows; ff_sbr_hf_apply_noise_1_neon selects row (kx & 1)
+        .float 0.0,  1.0,  0.0, -1.0            // row 0: +s_m / -s_m on the second float of successive pairs
+        .float 0.0, -1.0,  0.0,  1.0            // row 1: same with the signs swapped
+endconst
+
+const phi_noise_2, align=4                      // s_m multiplier for ff_sbr_hf_apply_noise_2_neon: adds -s_m to the first float of each pair
+        .float -1.0, 0.0, -1.0, 0.0
+endconst
+
+const phi_noise_3, align=4                      // two 16-byte rows; ff_sbr_hf_apply_noise_3_neon selects row (kx & 1)
+        .float 0.0, -1.0,  0.0,  1.0            // row 0: -s_m / +s_m on the second float of successive pairs
+        .float 0.0,  1.0,  0.0, -1.0            // row 1: same with the signs swapped
+endconst
+
+function ff_sbr_sum64x5_neon, export=1          // x0 = z; z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k = 0..63
+        add             x1, x0, #64*4           // x1 = &z[64]
+        add             x2, x0, #128*4          // x2 = &z[128]
+        add             x3, x0, #192*4          // x3 = &z[192]
+        add             x4, x0, #256*4          // x4 = &z[256]
+        mov             x5, #64                 // floats still to be written
+1:      ld1             {v0.4S}, [x0]           // no post-increment here: x0 advances on the store below
+        ld1             {v1.4S}, [x1], #16
+        fadd            v0.4S, v0.4S, v1.4S     // terms are added in ascending address order
+        ld1             {v2.4S}, [x2], #16
+        fadd            v0.4S, v0.4S, v2.4S
+        ld1             {v3.4S}, [x3], #16
+        fadd            v0.4S, v0.4S, v3.4S
+        ld1             {v4.4S}, [x4], #16
+        fadd            v0.4S, v0.4S, v4.4S
+        st1             {v0.4S}, [x0], #16      // write the four sums back in place
+        subs            x5, x5, #4              // four floats per iteration, 16 iterations in total
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_sum_square_neon, export=1       // x0 = x, w1 = n; returns in s0 the sum of squares of all floats in x[0..n-1]
+        movi            v0.4S, #0               // four independent partial sums
+1:      ld1             {v1.4S}, [x0], #16      // two complex samples: re, im, re, im
+        fmla            v0.4S, v1.4S, v1.4S     // accumulate the square of each lane
+        subs            w1, w1, #2              // two samples per iteration, so an odd n also reads one sample past x[n-1]
+        b.gt            1b
+        faddp           v0.4S, v0.4S, v0.4S     // first pairwise reduction: 4 partial sums -> 2
+        faddp           v0.4S, v0.4S, v0.4S     // second reduction leaves the total in lane 0, which is s0
+        ret
+endfunc
+
+function ff_sbr_neg_odd_64_neon, export=1       // x0 = x; flips the sign of x[1], x[3], ..., x[63] in place
+        mov             x1, x0                  // x1 = write pointer, x0 = read pointer
+        movi            v5.4S, #1<<7, lsl #24   // 0x80000000 in every lane: the float sign bit
+        ld2             {v0.4S, v1.4S}, [x0], #32 // de-interleave 8 floats: v0 = even indices, v1 = odd indices
+        eor             v1.16B, v1.16B, v5.16B  // toggle the sign bit of the odd-index values
+        ld2             {v2.4S, v3.4S}, [x0], #32
+.rept 3
+        st2             {v0.4S, v1.4S}, [x1], #32 // re-interleave and store the previously loaded group
+        eor             v3.16B, v3.16B, v5.16B
+        ld2             {v0.4S, v1.4S}, [x0], #32 // loads run ahead of the stores by two groups of 8 floats
+        st2             {v2.4S, v3.4S}, [x1], #32
+        eor             v1.16B, v1.16B, v5.16B
+        ld2             {v2.4S, v3.4S}, [x0], #32
+.endr
+        eor             v3.16B, v3.16B, v5.16B  // final group still needs its odd lanes negated
+        st2             {v0.4S, v1.4S}, [x1], #32 // drain the last two groups: 8 loads and 8 stores of 8 floats = 64 floats
+        st2             {v2.4S, v3.4S}, [x1], #32
+        ret
+endfunc
+
+function ff_sbr_qmf_pre_shuffle_neon, export=1  // x0 = z; fills z[64..127] from z[0..63]: z[64+2k] = -z[64-k], z[65+2k] = z[k+1] for k = 1..31
+        add             x1, x0, #60*4           // x1 = &z[60], walks downwards
+        add             x2, x0, #64*4           // x2 = &z[64], output pointer
+        mov             x3, #-16                // post-index step for the descending 4-float loads
+        mov             x4, #-4                 // post-index step used once in the tail
+        movi            v6.4S, #1<<7, lsl #24   // 0x80000000 in every lane: the float sign bit
+        ld1             {v0.2S}, [x0], #8       // z[0], z[1] ...
+        st1             {v0.2S}, [x2], #8       // ... are copied unchanged to z[64], z[65]
+.rept 7
+        ld1             {v1.4S}, [x1], x3       // four values from the descending side
+        ld1             {v2.4S}, [x0], #16      // four values from the ascending side
+        eor             v1.16B, v1.16B, v6.16B  // negate the descending values
+        rev64           v1.4S, v1.4S            // rev64 followed by ext #8 ...
+        ext             v1.16B, v1.16B, v1.16B, #8 // ... reverses the order of all four lanes
+        st2             {v1.4S, v2.4S}, [x2], #32 // interleave: negated-reversed, ascending, ...
+.endr
+        add             x1, x1, #8              // tail: only three pairs remain; x1 = &z[34]
+        ld1             {v1.2S}, [x1], x4       // z[34], z[35]; upper half of v1 is cleared; x1 = &z[33]
+        ld1             {v2.2S}, [x0], #8       // z[30], z[31]; x0 = &z[32]
+        ld1             {v1.S}[3], [x1]         // lane 3 = z[33]
+        ld1             {v2.S}[2], [x0]         // lane 2 = z[32]
+        eor             v1.16B, v1.16B, v6.16B
+        rev64           v1.4S, v1.4S            // lanes become -z[35], -z[34], -z[33], unused
+        st2             {v1.2S, v2.2S}, [x2], #16 // z[122..125] = -z[35], z[30], -z[34], z[31]
+        st2             {v1.S, v2.S}[2], [x2]   // z[126], z[127] = -z[33], z[32]
+        ret
+endfunc
+
+function ff_sbr_qmf_post_shuffle_neon, export=1 // x0 = W, x1 = z; W[k][0] = -z[63-k], W[k][1] = z[k] for k = 0..31
+        add             x2, x1, #60*4           // x2 = &z[60], walks downwards
+        mov             x3, #-16                // post-index step for the descending loads
+        mov             x4, #32                 // output pairs still to be written
+        movi            v6.4S, #1<<7, lsl #24   // 0x80000000 in every lane: the float sign bit
+1:      ld1             {v0.4S}, [x2], x3       // four values from the top of z
+        ld1             {v1.4S}, [x1], #16      // four values from the bottom of z
+        eor             v0.16B, v0.16B, v6.16B  // negate the top values
+        rev64           v0.4S, v0.4S            // rev64 followed by ext #8 ...
+        ext             v0.16B, v0.16B, v0.16B, #8 // ... reverses the order of all four lanes
+        st2             {v0.4S, v1.4S}, [x0], #32 // interleave into four consecutive W entries
+        subs            x4, x4, #4              // four pairs per iteration, 8 iterations in total
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_qmf_deint_neg_neon, export=1    // x0 = v, x1 = src; v[i] = src[63-2i], v[63-i] = -src[62-2i] for i = 0..31
+        add             x1, x1, #56*4           // x1 = &src[56], read from the end backwards
+        add             x2, x0, #60*4           // x2 = &v[60], upper output half, walks downwards
+        mov             x3, #-32                // step back 8 source floats per iteration
+        mov             x4, #32                 // values of i still to be produced
+        movi            v2.4S, #1<<7, lsl #24   // 0x80000000 in every lane: the float sign bit
+1:      ld2             {v0.4S, v1.4S}, [x1], x3 // de-interleave: v0 = even-index src, v1 = odd-index src
+        eor             v0.16B, v0.16B, v2.16B  // negate the even-index values
+        rev64           v1.4S, v1.4S            // rev64 followed by ext #8 ...
+        ext             v1.16B, v1.16B, v1.16B, #8 // ... reverses the odd-index values
+        st1             {v0.4S}, [x2]           // negated evens go to the upper half, in load order
+        st1             {v1.4S}, [x0], #16      // reversed odds go to the lower half, ascending
+        sub             x2, x2, #16
+        subs            x4, x4, #4              // four values of i per iteration, 8 iterations in total
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_qmf_deint_bfly_neon, export=1   // x0 = v, x1 = src0, x2 = src1; v[i] = src0[i] - src1[63-i], v[127-i] = src0[i] + src1[63-i] for i = 0..63
+        add             x2, x2, #60*4           // x2 = &src1[60], walks downwards
+        add             x3, x0, #124*4          // x3 = &v[124], upper output half, walks downwards
+        mov             x4, #64                 // values of i still to be produced
+        mov             x5, #-16                // post-index step for the descending accesses
+1:      ld1             {v0.4S}, [x1], #16      // src0[i..i+3]
+        ld1             {v1.4S}, [x2], x5       // src1[60-i..63-i]
+        rev64           v2.4S, v0.4S            // v2 = src0 group with its four lanes reversed
+        ext             v2.16B, v2.16B, v2.16B, #8
+        rev64           v3.4S, v1.4S            // v3 = src1 group with its four lanes reversed
+        ext             v3.16B, v3.16B, v3.16B, #8
+        fadd            v1.4S, v1.4S, v2.4S     // sums, laid out in src1 order for the descending store
+        fsub            v0.4S, v0.4S, v3.4S     // differences, laid out in src0 order for the ascending store
+        st1             {v0.4S}, [x0], #16
+        st1             {v1.4S}, [x3], x5
+        subs            x4, x4, #4              // four values of i per iteration, 16 iterations in total
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_hf_gen_neon, export=1           // x0 = X_high, x1 = X_low, x2 = alpha0, x3 = alpha1, s0 = bw, w4 = start, w5 = end
+        sxtw            x4, w4                  // widen the int arguments before using them in address arithmetic
+        sxtw            x5, w5
+        movrel          x6, factors
+        ld1             {v7.4S}, [x6]           // v7 = (1, -1, 1, -1)
+        dup             v1.4S, v0.S[0]          // v1 = (bw, bw, bw, bw)
+        mov             v2.8B, v1.8B            // v2 low half = (bw, bw)
+        mov             v2.S[2], v7.S[0]        // v2 = (bw, bw, 1, 1)
+        mov             v2.S[3], v7.S[0]
+        fmul            v1.4S, v1.4S, v2.4S     // v1 = (bw^2, bw^2, bw, bw)
+        ld1             {v0.D}[0], [x3]         // v0 low half = alpha1[0..1]; bw has already been copied out of v0
+        ld1             {v0.D}[1], [x2]         // v0 high half = alpha0[0..1]
+        fmul            v0.4S, v0.4S, v1.4S     // a = (alpha1 * bw^2, alpha0 * bw)
+        fmul            v1.4S, v0.4S, v7.4S     // v1 = (a0, -a1, a2, -a3): weights for the real part
+        rev64           v0.4S, v0.4S            // v0 = (a1, a0, a3, a2): weights for the imaginary part
+        sub             x7, x5, x4              // x7 = end - start = number of output samples
+        add             x0, x0, x4, lsl #3      // x0 = &X_high[start]
+        add             x1, x1, x4, lsl #3
+        sub             x1, x1, #16             // x1 = &X_low[start - 2]
+1:      ld1             {v2.4S}, [x1], #16      // X_low[i-2] and X_low[i-1]
+        ld1             {v3.2S}, [x1]           // X_low[i]
+        fmul            v4.4S, v2.4S, v1.4S     // products contributing to the real part
+        fmul            v5.4S, v2.4S, v0.4S     // products contributing to the imaginary part
+        faddp           v4.4S, v4.4S, v4.4S     // two pairwise adds reduce each vector to a single sum in lane 0
+        faddp           v5.4S, v5.4S, v5.4S
+        faddp           v4.4S, v4.4S, v4.4S
+        faddp           v5.4S, v5.4S, v5.4S
+        mov             v4.S[1], v5.S[0]        // pack (real, imag) into the low half of v4
+        fadd            v4.2S, v4.2S, v3.2S     // add X_low[i]
+        st1             {v4.2S}, [x0], #8       // X_high[i] = result
+        sub             x1, x1, #8              // step back so the next load starts at X_low[i-1]
+        subs            x7, x7, #1              // the body runs at least once, even if end <= start
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_hf_g_filt_neon, export=1        // x0 = Y, x1 = X_high, x2 = g_filt, w3 = m_max, x4 = ixh; Y[m] = X_high[m][ixh] * g_filt[m]
+        sxtw            x3, w3
+        sxtw            x4, w4                  // NOTE(review): the C prototype declares ixh as intptr_t, but only its low 32 bits are used here
+        mov             x5, #40*2*4             // byte size of one float[40][2] row of X_high
+        add             x1, x1, x4, lsl #3      // x1 = &X_high[0][ixh]
+1:      ld1             {v0.2S}, [x1], x5       // one complex value, then advance to the next row
+        ld1             {v1.S}[0], [x2], #4     // g_filt[m]
+        fmul            v2.4S, v0.4S, v1.S[0]   // 4S multiply, but only the low two lanes are stored
+        st1             {v2.2S}, [x0], #8
+        subs            x3, x3, #1              // the body runs at least once, even if m_max <= 0
+        b.gt            1b
+        ret
+endfunc
+
+function ff_sbr_autocorrelate_neon, export=1    // x0 = x (40 complex floats), x1 = phi; writes lag-0/1/2 correlation sums into phi
+        mov             x2, #38                 // loop count: i = 0..37
+        movrel          x3, factors
+        ld1             {v0.4S}, [x3]           // v0 = (1, -1, 1, -1)
+        movi            v1.4S, #0               // v1: running (re^2, im^2) sums
+        movi            v2.4S, #0               // v2: running products with x[i].re
+        movi            v3.4S, #0               // v3: running products with x[i].im
+        ld1             {v4.2S}, [x0], #8       // v4 = x[0]
+        ld1             {v5.2S}, [x0], #8       // v5 low half = x[1]
+        fmul            v16.2S, v4.2S, v4.2S    // i = 0 terms, kept so they can be subtracted again after the loop
+        fmul            v17.2S, v5.2S, v4.S[0]
+        fmul            v18.2S, v5.2S, v4.S[1]
+1:      ld1             {v5.D}[1], [x0], #8     // v5 = (x[i+1], x[i+2]) while v4 = x[i]
+        fmla            v1.2S, v4.2S, v4.2S     // v1 += (re^2, im^2) of x[i]
+        fmla            v2.4S, v5.4S, v4.S[0]   // v2 += (x[i+1], x[i+2]) * x[i].re
+        fmla            v3.4S, v5.4S, v4.S[1]   // v3 += (x[i+1], x[i+2]) * x[i].im
+        mov             v4.D[0], v5.D[0]        // slide the window: v4 = x[i+1]
+        mov             v5.D[0], v5.D[1]        // v5 low half = x[i+2]
+        subs            x2, x2, #1
+        b.gt            1b
+        fmul            v19.2S, v4.2S, v4.2S    // after the loop v4 = x[38], v5 low half = x[39]: i = 38 terms
+        fmul            v20.2S, v5.2S, v4.S[0]
+        fmul            v21.2S, v5.2S, v4.S[1]
+        fadd            v22.4S, v2.4S, v20.4S   // low half: add the i = 38 term and drop the i = 0 term, so lag 1 covers i = 1..38
+        fsub            v22.4S, v22.4S, v17.4S  // high half (lag 2, i = 0..37) is unchanged: v20 and v17 have zero upper lanes
+        fadd            v23.4S, v3.4S, v21.4S
+        fsub            v23.4S, v23.4S, v18.4S
+        rev64           v23.4S, v23.4S          // swap within each pair and apply (1, -1) so that the sum below ...
+        fmul            v23.4S, v23.4S, v0.4S
+        fadd            v22.4S, v22.4S, v23.4S  // ... yields x[i+lag] * conj(x[i]) as (re, im) for lag 1 and lag 2
+        st1             {v22.4S}, [x1], #16     // floats 0..3 of phi: phi[0][0][0..1] and phi[0][1][0..1]
+        fadd            v23.2S, v1.2S, v19.2S   // squared terms with i = 38 added and i = 0 removed: i = 1..38
+        fsub            v23.2S, v23.2S, v16.2S
+        faddp           v23.2S, v23.2S, v23.2S  // re^2 + im^2
+        st1             {v23.S}[0], [x1]        // float 4 of phi: phi[1][0][0]
+        add             x1, x1, #8
+        rev64           v3.2S, v3.2S            // same complex combination for the unadjusted lag-1 sums (i = 0..37)
+        fmul            v3.2S, v3.2S, v0.2S
+        fadd            v2.2S, v2.2S, v3.2S
+        st1             {v2.2S}, [x1]           // floats 6..7 of phi: phi[1][1][0..1]
+        add             x1, x1, #16
+        faddp           v1.2S, v1.2S, v1.2S     // unadjusted sum of squares, i = 0..37
+        st1             {v1.S}[0], [x1]         // float 10 of phi: phi[2][1][0]
+        ret
+endfunc
+
+.macro apply_noise_common                       // shared loop body; expects x0 = Y, x1 = s_m, x2 = q_filt, w3 = noise, w5 = m_max, v1 = s_m multipliers
+        sxtw            x3, w3
+        sxtw            x5, w5
+        movrel          x7, X(ff_sbr_noise_table)
+        add             x3, x3, #1              // first table entry used is noise + 1
+1:      and             x3, x3, #0x1ff          // wrap the table index to 0..511
+        add             x8, x7, x3, lsl #3      // x8 = table + index * 8 bytes
+        add             x3, x3, #2              // two table entries are consumed per iteration
+        ld1             {v2.4S}, [x0]           // two Y entries (4 floats)
+        ld1             {v3.2S}, [x1], #8       // two s_m values
+        ld1             {v4.2S}, [x2], #8       // two q_filt values
+        ld1             {v5.4S}, [x8]           // NOTE(review): reads entries index and index + 1 with no wrap on the second; confirm the table is padded for index 0x1ff
+        mov             v6.16B, v2.16B
+        zip1            v3.4S, v3.4S, v3.4S     // duplicate each s_m value across its float pair
+        zip1            v4.4S, v4.4S, v4.4S     // duplicate each q_filt value across its float pair
+        fmla            v6.4S, v1.4S, v3.4S     // v6 = Y + multipliers * s_m
+        fmla            v2.4S, v5.4S, v4.4S     // v2 = Y + noise entries * q_filt
+        fcmeq           v7.4S, v3.4S, #0        // lanes where s_m == 0
+        bif             v2.16B, v6.16B, v7.16B  // keep the noise result where s_m == 0, otherwise take the s_m result
+        st1             {v2.4S}, [x0], #16
+        subs            x5, x5, #2              // two Y entries per iteration
+        b.gt            1b
+.endm
+
+function ff_sbr_hf_apply_noise_0_neon, export=1 // x0 = Y, x1 = s_m, x2 = q_filt, w3 = noise, w4 = kx (unused), w5 = m_max
+        movrel          x9, phi_noise_0
+        ld1             {v1.4S}, [x9]           // v1 = (1, 0, 1, 0), the s_m multipliers used by the shared loop
+        apply_noise_common
+        ret
+endfunc
+
+function ff_sbr_hf_apply_noise_1_neon, export=1 // x0 = Y, x1 = s_m, x2 = q_filt, w3 = noise, w4 = kx, w5 = m_max
+        movrel          x9, phi_noise_1
+        and             x4, x4, #1              // only the parity of kx matters
+        add             x9, x9, x4, lsl #4      // pick the 16-byte row of phi_noise_1 for that parity
+        ld1             {v1.4S}, [x9]           // v1 = s_m multipliers used by the shared loop
+        apply_noise_common
+        ret
+endfunc
+
+function ff_sbr_hf_apply_noise_2_neon, export=1 // x0 = Y, x1 = s_m, x2 = q_filt, w3 = noise, w4 = kx (unused), w5 = m_max
+        movrel          x9, phi_noise_2
+        ld1             {v1.4S}, [x9]           // v1 = (-1, 0, -1, 0), the s_m multipliers used by the shared loop
+        apply_noise_common
+        ret
+endfunc
+
+function ff_sbr_hf_apply_noise_3_neon, export=1 // x0 = Y, x1 = s_m, x2 = q_filt, w3 = noise, w4 = kx, w5 = m_max
+        movrel          x9, phi_noise_3
+        and             x4, x4, #1              // only the parity of kx matters
+        add             x9, x9, x4, lsl #4      // pick the 16-byte row of phi_noise_3 for that parity
+        ld1             {v1.4S}, [x9]           // v1 = s_m multipliers used by the shared loop
+        apply_noise_common
+        ret
+endfunc
diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S
new file mode 100644
index 0000000000..5e4d021a97
--- /dev/null
+++ b/libavcodec/aarch64/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define Z1  22725  //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z2  21407  //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z3  19266  //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4  16383  //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 gives 16384; NOTE(review): 16383 looks deliberate - confirm against the C simple IDCT
+#define Z5  12873  //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z6  8867   //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z7  4520   //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4c ((1<<(COL_SHIFT-1))/Z4)  // column rounding term divided by Z4; added to the input before the multiply by z4
+#define ROW_SHIFT 11  // right shift applied to the row pass results
+#define COL_SHIFT 20  // total column shift: 16 bits via addhn/subhn plus COL_SHIFT-16 in the exported functions
+
+#define z1 v0.H[0]  // z1..z7 and z4c name the lanes of v0 as loaded from idct_coeff_neon by idct_start
+#define z2 v0.H[1]
+#define z3 v0.H[2]
+#define z4 v0.H[3]
+#define z5 v0.H[4]
+#define z6 v0.H[5]
+#define z7 v0.H[6]
+#define z4c v0.H[7]
+
+const   idct_coeff_neon, align=4        // eight 16-bit constants, in the lane order of the aliases above
+        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
+endconst
+
+.macro idct_start data                  // common prologue of the exported IDCT entry points
+        prfm            pldl1keep, [\data]      // prefetch the memory the data argument points to
+        mov             x10, x30        // keep the return address: the bl calls that follow overwrite x30
+        movrel          x3, idct_coeff_neon
+        ld1             {v0.2D}, [x3]   // v0 = the eight constants addressed through z1..z7 and z4c
+.endm
+
+.macro idct_end                         // common epilogue of the exported IDCT entry points
+        ret             x10             // return through the address idct_start saved in x10; RET marks it as a function return, unlike an indirect BR
+.endm
+
+.macro smull1 a, b, c                   // lets a mnemonic with a pasted suffix of 1 assemble as plain smull (a suffix of 2 gives the real smull2)
+        smull           \a, \b, \c
+.endm
+
+.macro smlal1 a, b, c                   // same suffix trick for smlal
+        smlal           \a, \b, \c
+.endm
+
+.macro smlsl1 a, b, c                   // same suffix trick for smlsl
+        smlsl           \a, \b, \c
+.endm
+
+.macro idct_col4_top y1, y2, y3, y4, i, l      // partial sums from y2..y4; y1 is not read here, v23 must be prepared by the user of the macro
+        smull\i         v7.4S,  \y3\l, z2
+        smull\i         v16.4S, \y3\l, z6
+        smull\i         v17.4S, \y2\l, z1
+        add             v19.4S, v23.4S, v7.4S  // v19 = v23 + y3*z2
+        smull\i         v18.4S, \y2\l, z3
+        add             v20.4S, v23.4S, v16.4S // v20 = v23 + y3*z6
+        smull\i         v5.4S,  \y2\l, z5
+        sub             v21.4S, v23.4S, v16.4S // v21 = v23 - y3*z6
+        smull\i         v6.4S,  \y2\l, z7
+        sub             v22.4S, v23.4S, v7.4S  // v22 = v23 - y3*z2
+
+        smlal\i         v17.4S, \y4\l, z3      // v17 = y2*z1 + y4*z3
+        smlsl\i         v18.4S, \y4\l, z7      // v18 = y2*z3 - y4*z7
+        smlsl\i         v5.4S,  \y4\l, z1      // v5  = y2*z5 - y4*z1
+        smlsl\i         v6.4S,  \y4\l, z5      // v6  = y2*z7 - y4*z5
+.endm
+
+.macro idct_row4_neon y1, y2, y3, y4, pass     // row pass on 64 bytes read from x2 into y1..y4; pass is a numeric label unique per use
+        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
+        movi            v23.4S, #1<<2, lsl #8  // 1024 = 1 << (ROW_SHIFT - 1), the rounding term
+        orr             v5.16B, \y1\().16B, \y2\().16B
+        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
+        orr             v6.16B, \y3\().16B, \y4\().16B
+        orr             v5.16B, v5.16B, v6.16B // OR of all four inputs
+        mov             x3, v5.D[1]            // x3 is zero only if the upper halves of all four inputs are zero
+        smlal           v23.4S, \y1\().4H, z4  // v23 = rounding term + low half of y1 times z4
+
+        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H
+
+        cmp             x3, #0
+        b.eq            \pass\()f              // upper halves are all zero: skip their contribution
+
+        smull2          v7.4S, \y1\().8H, z4   // v7 = upper half of y1 times z4
+        smlal2          v17.4S, \y2\().8H, z5
+        smlsl2          v18.4S, \y2\().8H, z1
+        smull2          v16.4S, \y3\().8H, z2  // v16 = upper half of y3 times z2
+        smlal2          v5.4S, \y2\().8H, z7
+        add             v19.4S, v19.4S, v7.4S  // fold the y1 upper-half product into v19..v22
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S
+        smlal2          v6.4S, \y2\().8H, z3
+        smull2          v7.4S, \y3\().8H, z6   // v7 is reused: upper half of y3 times z6
+        smlal2          v17.4S, \y4\().8H, z7
+        smlsl2          v18.4S, \y4\().8H, z5
+        smlal2          v5.4S, \y4\().8H, z3
+        smlsl2          v6.4S, \y4\().8H, z1
+        add             v19.4S, v19.4S, v7.4S  // fold the y3 upper-half products into v19..v22
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S
+        sub             v22.4S, v22.4S, v7.4S
+
+\pass:  add             \y3\().4S, v19.4S, v17.4S      // sums go to the low halves of y1..y4
+        add             \y4\().4S, v20.4S, v18.4S
+        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
+        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
+        add             v7.4S, v21.4S, v5.4S
+        add             v16.4S, v22.4S, v6.4S
+        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
+        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
+        sub             v22.4S, v22.4S, v6.4S  // differences go to the high halves, in reverse order
+        sub             v19.4S, v19.4S, v17.4S
+        sub             v21.4S, v21.4S, v5.4S
+        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
+        sub             v20.4S, v20.4S, v18.4S
+        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
+        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
+        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT
+
+        trn1            v16.8H, \y1\().8H, \y2\().8H   // transpose the 4x4 halfword block in each 64-bit half of y1..y4
+        trn2            v17.8H, \y1\().8H, \y2\().8H
+        trn1            v18.8H, \y3\().8H, \y4\().8H
+        trn2            v19.8H, \y3\().8H, \y4\().8H
+        trn1            \y1\().4S, v16.4S, v18.4S
+        trn1            \y2\().4S, v17.4S, v19.4S
+        trn2            \y3\().4S, v16.4S, v18.4S
+        trn2            \y4\().4S, v17.4S, v19.4S
+.endm
+
+.macro declare_idct_col4_neon i, l     // emits idct_col4_neon<i>: column pass over half <i> (1 = low, 2 = high) of v24-v31
+function idct_col4_neon\i
+        dup             v23.4H, z4c             // rounding term Z4c in every lane
+.if \i == 1
+        add             v23.4H, v23.4H, v24.4H  // add the low half of v24
+.else
+        mov             v5.D[0], v24.D[1]
+        add             v23.4H, v23.4H, v5.4H   // add the high half of v24
+.endif
+        smull           v23.4S, v23.4H, z4      // v23 = (v24 half + Z4c) * z4
+
+        idct_col4_top   v24, v25, v26, v27, \i, \l
+
+        mov             x4, v28.D[\i - 1]       // selected halves of v28 and v29 as integers for the zero tests
+        mov             x5, v29.D[\i - 1]
+        cmp             x4, #0
+        b.eq            1f                      // v28 half is all zero: nothing to add
+
+        smull\i         v7.4S,  v28\l,  z4
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S
+
+1:      mov             x4, v30.D[\i - 1]
+        cmp             x5, #0
+        b.eq            2f                      // v29 half is all zero: nothing to add
+
+        smlal\i         v17.4S, v29\l, z5
+        smlsl\i         v18.4S, v29\l, z1
+        smlal\i         v5.4S,  v29\l, z7
+        smlal\i         v6.4S,  v29\l, z3
+
+2:      mov             x5, v31.D[\i - 1]
+        cmp             x4, #0
+        b.eq            3f                      // v30 half is all zero: nothing to add
+
+        smull\i         v7.4S,  v30\l, z6
+        smull\i         v16.4S, v30\l, z2
+        add             v19.4S, v19.4S, v7.4S
+        sub             v22.4S, v22.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S
+
+3:      cmp             x5, #0
+        b.eq            4f                      // v31 half is all zero: nothing to add
+
+        smlal\i         v17.4S, v31\l, z7
+        smlsl\i         v18.4S, v31\l, z5
+        smlal\i         v5.4S,  v31\l, z3
+        smlsl\i         v6.4S,  v31\l, z1
+
+4:      addhn           v7.4H, v19.4S, v17.4S   // results: high 16 bits of each sum or difference, in v7, v16, v17, v18
+        addhn2          v7.8H, v20.4S, v18.4S
+        subhn           v18.4H, v20.4S, v18.4S
+        subhn2          v18.8H, v19.4S, v17.4S
+
+        addhn           v16.4H, v21.4S, v5.4S
+        addhn2          v16.8H, v22.4S, v6.4S
+        subhn           v17.4H, v22.4S, v6.4S
+        subhn2          v17.8H, v21.4S, v5.4S
+
+        ret
+endfunc
+.endm
+
+declare_idct_col4_neon 1, .4H           // idct_col4_neon1: low halves, plain multiply forms
+declare_idct_col4_neon 2, .8H           // idct_col4_neon2: high halves, the 2-suffixed multiply forms
+
+function ff_simple_idct_put_neon, export=1     // IDCT of the 128 bytes at x2; stores 8 rows of 8 bytes at x0, advancing by x1 per row
+        idct_start      x2
+
+        idct_row4_neon  v24, v25, v26, v27, 1  // first 64 bytes at x2
+        idct_row4_neon  v28, v29, v30, v31, 2  // next 64 bytes
+        bl              idct_col4_neon1        // low halves; results in v7, v16, v17, v18
+
+        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16   // finish the column shift and saturate to unsigned bytes
+        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2        // high halves; v7, v16, v17, v18 are overwritten
+
+        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16
+
+        zip1            v16.4S, v1.4S, v2.4S   // pair 4 bytes of the first call with 4 of the second: one 8-byte row per 64-bit lane
+        zip2            v17.4S, v1.4S, v2.4S
+
+        st1             {v16.D}[0], [x0], x1
+        st1             {v16.D}[1], [x0], x1
+
+        zip1            v18.4S, v3.4S, v4.4S
+        zip2            v19.4S, v3.4S, v4.4S
+
+        st1             {v17.D}[0], [x0], x1
+        st1             {v17.D}[1], [x0], x1
+        st1             {v18.D}[0], [x0], x1
+        st1             {v18.D}[1], [x0], x1
+        st1             {v19.D}[0], [x0], x1
+        st1             {v19.D}[1], [x0], x1
+
+        idct_end
+endfunc
+
+function ff_simple_idct_add_neon, export=1     // IDCT of the 128 bytes at x2, added with saturation to 8 rows of 8 bytes at x0 (row step x1)
+        idct_start      x2
+
+        idct_row4_neon  v24, v25, v26, v27, 1  // first 64 bytes at x2
+        idct_row4_neon  v28, v29, v30, v31, 2  // next 64 bytes
+        bl              idct_col4_neon1        // low halves; results in v7, v16, v17, v18
+
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16    // finish the column shift, keep 16-bit values
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2        // high halves; v7, v16, v17, v18 are overwritten
+
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+
+        mov             x9,  x0                // x0 walks the rows for loading, x9 walks them again for storing
+        ld1             {v19.D}[0], [x0], x1
+        zip1            v23.2D, v1.2D, v7.2D   // join the 64-bit halves of both calls into full 8-value rows
+        zip2            v24.2D, v1.2D, v7.2D
+        ld1             {v19.D}[1], [x0], x1
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        ld1             {v20.D}[0], [x0], x1
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        ld1             {v20.D}[1], [x0], x1
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        ld1             {v21.D}[0], [x0], x1
+        uaddw           v23.8H, v23.8H, v19.8B // add the zero-extended bytes loaded from the destination
+        uaddw2          v24.8H, v24.8H, v19.16B
+        ld1             {v21.D}[1], [x0], x1
+        sqxtun          v23.8B, v23.8H         // saturate the sums to unsigned bytes
+        sqxtun2         v23.16B, v24.8H
+        ld1             {v22.D}[0], [x0], x1
+        uaddw           v24.8H, v25.8H, v20.8B
+        uaddw2          v25.8H, v26.8H, v20.16B
+        ld1             {v22.D}[1], [x0], x1
+        sqxtun          v24.8B, v24.8H
+        sqxtun2         v24.16B, v25.8H
+        st1             {v23.D}[0], [x9], x1
+        uaddw           v25.8H, v27.8H, v21.8B
+        uaddw2          v26.8H, v28.8H, v21.16B
+        st1             {v23.D}[1], [x9], x1
+        sqxtun          v25.8B, v25.8H
+        sqxtun2         v25.16B, v26.8H
+        st1             {v24.D}[0], [x9], x1
+        uaddw           v26.8H, v29.8H, v22.8B
+        uaddw2          v27.8H, v30.8H, v22.16B
+        st1             {v24.D}[1], [x9], x1
+        sqxtun          v26.8B, v26.8H
+        sqxtun2         v26.16B, v27.8H
+        st1             {v25.D}[0], [x9], x1
+        st1             {v25.D}[1], [x9], x1
+        st1             {v26.D}[0], [x9], x1
+        st1             {v26.D}[1], [x9], x1
+
+        idct_end
+endfunc
+
+function ff_simple_idct_neon, export=1         // in-place IDCT: reads 128 bytes at x0 and writes 128 bytes of 16-bit results back
+        idct_start      x0
+
+        mov             x2,  x0                // idct_row4_neon reads through x2
+        idct_row4_neon  v24, v25, v26, v27, 1  // first 64 bytes
+        idct_row4_neon  v28, v29, v30, v31, 2  // next 64 bytes
+        sub             x2, x2, #128           // rewind x2 to the block start for the stores below
+        bl              idct_col4_neon1        // low halves; results in v7, v16, v17, v18
+
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16    // finish the column shift, keep 16-bit values
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2        // high halves; v7, v16, v17, v18 are overwritten
+
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+
+        zip1            v23.2D, v1.2D, v7.2D   // join the 64-bit halves of both calls into full 8-value rows
+        zip2            v24.2D, v1.2D, v7.2D
+        st1             {v23.2D,v24.2D}, [x2], #32
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        st1             {v25.2D,v26.2D}, [x2], #32
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        st1             {v27.2D,v28.2D}, [x2], #32
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        st1             {v29.2D,v30.2D}, [x2], #32
+
+        idct_end
+endfunc
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/synth_filter_init.c
index d3430d045c..767b01112a 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/synth_filter_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,8 +23,8 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavutil/attributes.h"
 #include "libavutil/internal.h"
-#include "libavcodec/dcadsp.h"
 #include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
 
 #include "asm-offsets.h"
 
@@ -32,25 +32,12 @@
 AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
 #endif
 
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
 void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float *synth_buf_ptr, int *synth_buf_offset,
                                 float synth_buf2[32], const float window[512],
                                 float out[32], const float in[32],
                                 float scale);
 
-av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
-        s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
-    }
-}
-
 av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
index b001c737da..8fcd71f252 100644
--- a/libavcodec/aarch64/synth_filter_neon.S
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -2,20 +2,20 @@
  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index ab97a97740..13dfd74940 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp.S b/libavcodec/aarch64/videodsp.S
index 7ce5a7ddf6..24067cc2af 100644
--- a/libavcodec/aarch64/videodsp.S
+++ b/libavcodec/aarch64/videodsp.S
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/videodsp_init.c b/libavcodec/aarch64/videodsp_init.c
index 59b697d4f4..6f667a6d3e 100644
--- a/libavcodec/aarch64/videodsp_init.c
+++ b/libavcodec/aarch64/videodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_init.c b/libavcodec/aarch64/vorbisdsp_init.c
index 3559b54a30..c796f95e61 100644
--- a/libavcodec/aarch64/vorbisdsp_init.c
+++ b/libavcodec/aarch64/vorbisdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vorbisdsp_neon.S b/libavcodec/aarch64/vorbisdsp_neon.S
index 11f71f1d89..e76feebc54 100644
--- a/libavcodec/aarch64/vorbisdsp_neon.S
+++ b/libavcodec/aarch64/vorbisdsp_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h
new file mode 100644
index 0000000000..ea7665dcc8
--- /dev/null
+++ b/libavcodec/aarch64/vp8dsp.h
@@ -0,0 +1,70 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_VP8DSP_H
+#define AVCODEC_AARCH64_VP8DSP_H
+
+#include "libavcodec/vp8dsp.h"
+
+#define VP8_LF_Y(hv, inner, opt)  /* loop_filter16 prototype */              \
+    void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst,            \
+                                                    ptrdiff_t stride,        \
+                                                    int flim_E, int flim_I,  \
+                                                    int hev_thresh)
+
+#define VP8_LF_UV(hv, inner, opt)  /* loop_filter8uv prototype */            \
+    void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU,          \
+                                                     uint8_t *dstV,          \
+                                                     ptrdiff_t stride,       \
+                                                     int flim_E, int flim_I, \
+                                                     int hev_thresh)
+
+#define VP8_LF_SIMPLE(hv, opt)  /* loop_filter16_simple prototype */    \
+    void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst,         \
+                                                  ptrdiff_t stride,     \
+                                                  int flim)
+
+#define VP8_LF_HV(inner, opt)  /* Y+UV, h+v */  \
+    VP8_LF_Y(h,  inner, opt);                   \
+    VP8_LF_Y(v,  inner, opt);                   \
+    VP8_LF_UV(h, inner, opt);                   \
+    VP8_LF_UV(v, inner, opt)
+
+#define VP8_LF(opt)  /* all loop filters */     \
+    VP8_LF_HV(,       opt);                     \
+    VP8_LF_HV(_inner, opt);                     \
+    VP8_LF_SIMPLE(h, opt);                      \
+    VP8_LF_SIMPLE(v, opt)
+
+#define VP8_MC(n, opt)  /* ff_put_vp8_<n>_<opt> prototype */            \
+    void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride,      \
+                                uint8_t *src, ptrdiff_t srcstride,      \
+                                int h, int x, int y)
+
+#define VP8_EPEL(w, opt)  /* 9 MC protos */     \
+    VP8_MC(pixels ## w, opt);                   \
+    VP8_MC(epel ## w ## _h4, opt);              \
+    VP8_MC(epel ## w ## _h6, opt);              \
+    VP8_MC(epel ## w ## _v4, opt);              \
+    VP8_MC(epel ## w ## _h4v4, opt);            \
+    VP8_MC(epel ## w ## _h6v4, opt);            \
+    VP8_MC(epel ## w ## _v6, opt);              \
+    VP8_MC(epel ## w ## _h4v6, opt);            \
+    VP8_MC(epel ## w ## _h6v6, opt)
+
+#endif /* AVCODEC_AARCH64_VP8DSP_H */
diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c
new file mode 100644
index 0000000000..dbc07408a0
--- /dev/null
+++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c
@@ -0,0 +1,77 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp.h"
+
+void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]); /* NOTE(review): declared but not installed by either init function in this file */
+
+void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+
+VP8_LF(neon); /* prototypes of the ff_vp8_{h,v}_loop_filter*_neon functions */
+
+VP8_EPEL(16, neon); /* prototypes of ff_put_vp8_pixels16_neon and the epel16 variants */
+VP8_EPEL(8,  neon); /* prototypes of ff_put_vp8_pixels8_neon and the epel8 variants */
+
+
+av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp)
+{
+    const int cpu_flags = av_get_cpu_flags(); /* CPU feature mask queried once */
+    if (!have_neon(cpu_flags))                /* without NEON, dsp is left exactly as passed in */
+        return;
+    dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; /* [0]: the 16 variants; the names show 2nd index = v taps, 3rd = h taps (2 = six) */
+    dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
+    dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
+    dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
+
+    dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;  /* [1]: the 8 variants; index 1 = four taps */
+    dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
+    dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
+    dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
+    dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
+}
+
+av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
+{
+    const int cpu_flags = av_get_cpu_flags(); /* CPU feature mask queried once */
+    if (!have_neon(cpu_flags))                /* without NEON, dsp is left exactly as passed in */
+        return;
+
+    dsp->vp8_idct_add       = ff_vp8_idct_add_neon;      /* inverse transform entry points */
+    dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_neon;
+    dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_neon;
+
+    dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; /* loop filters without a suffix */
+    dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
+    dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
+    dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
+
+    dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon; /* the _inner variants */
+    dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
+    dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
+    dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
+
+    dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;  /* the _simple variants */
+    dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
+}
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
new file mode 100644
index 0000000000..0ce9e301de
--- /dev/null
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -0,0 +1,1031 @@
+/*
+ * VP8 NEON optimisations
+ *
+ * Copyright (c) 2010 Rob Clark <rob@ti.com>
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+function ff_vp8_idct_add_neon, export=1    // x0: 4 pixel rows (updated in place), x1: 16 halfword coefficients (zeroed here), x2: byte step between pixel rows
+        ld1             {v0.8b - v3.8b},  [x1]    // 32 bytes: four halfwords in each of v0..v3
+        mov             w4,  #20091
+        movk            w4,  #35468/2, lsl #16
+        dup             v4.2s, w4    // v4.h[0] = 20091, v4.h[1] = 35468/2
+
+        smull           v26.4s, v1.4h,  v4.h[0]
+        smull           v27.4s, v3.4h,  v4.h[0]
+        sqdmulh         v20.4h, v1.4h,  v4.h[1]    // (2 * x * (35468/2)) >> 16
+        sqdmulh         v23.4h, v3.4h,  v4.h[1]
+        sqshrn          v21.4h, v26.4s, #16    // (x * 20091) >> 16 ...
+        sqshrn          v22.4h, v27.4s, #16
+        add             v21.4h, v21.4h, v1.4h    // ... + x
+        add             v22.4h, v22.4h, v3.4h
+
+        add             v16.4h,  v0.4h,   v2.4h
+        sub             v17.4h,  v0.4h,   v2.4h
+
+        add             v18.4h,  v21.4h,  v23.4h
+        sub             v19.4h,  v20.4h,  v22.4h
+
+        add             v0.4h,   v16.4h,  v18.4h
+        add             v1.4h,   v17.4h,  v19.4h
+        sub             v3.4h,   v16.4h,  v18.4h
+        sub             v2.4h,   v17.4h,  v19.4h
+
+        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7
+
+        movi            v29.8h, #0    // zero vector used to clear the coefficient block
+        smull           v26.4s,     v1.4h,  v4.h[0]    // second pass: same arithmetic as the first
+        st1             {v29.8h},   [x1],   #16    // clear coefficient bytes 0..15, x1 += 16
+        smull           v27.4s,     v3.4h,  v4.h[0]
+        st1             {v29.16b},  [x1]    // clear coefficient bytes 16..31
+        sqdmulh         v21.4h,     v1.4h,  v4.h[1]
+        sqdmulh         v23.4h,     v3.4h,  v4.h[1]
+        sqshrn          v20.4h,     v26.4s, #16
+        sqshrn          v22.4h,     v27.4s, #16
+        add             v20.4h,     v20.4h, v1.4h
+        add             v22.4h,     v22.4h, v3.4h
+        add             v16.4h,     v0.4h,  v2.4h
+        sub             v17.4h,     v0.4h,  v2.4h
+
+        add             v18.4h,     v20.4h, v23.4h
+        ld1             {v24.d}[0], [x0],   x2    // pixel row 0 (8 bytes loaded, only the low 4 are used)
+        zip1            v16.2d,     v16.2d, v17.2d
+        sub             v19.4h,     v21.4h, v22.4h
+        ld1             {v25.d}[0], [x0],   x2    // pixel row 1
+        zip1            v18.2d,     v18.2d, v19.2d
+        add             v0.8h,      v16.8h, v18.8h
+        ld1             {v25.d}[1], [x0],   x2    // pixel row 2
+        sub             v1.8h,      v16.8h, v18.8h
+        ld1             {v24.d}[1], [x0],   x2    // pixel row 3
+        srshr           v0.8h,      v0.8h,  #3    // rounding shift: (x + 4) >> 3
+        trn1            v24.4s,     v24.4s, v25.4s    // v24.s[0..3] = rows 0, 1, 3, 2 (4 pixels each)
+        srshr           v1.8h,      v1.8h,  #3
+        sub             x0,  x0,  x2,  lsl #2    // rewind x0 over the four rows just loaded
+
+        ext             v1.16b, v1.16b, v1.16b, #8
+        trn1            v3.2d,  v0.2d,  v1.2d
+        trn2            v0.2d,  v0.2d,  v1.2d
+        trn1            v1.8h,  v3.8h,  v0.8h
+        trn2            v3.8h,  v3.8h,  v0.8h
+        uzp1            v0.4s,  v1.4s,  v3.4s
+        uzp2            v1.4s,  v3.4s,  v1.4s
+
+        uaddw           v0.8h,  v0.8h, v24.8b    // residual + zero-extended pixels
+        uaddw2          v1.8h,  v1.8h, v24.16b
+        sqxtun          v0.8b,  v0.8h    // saturate to unsigned 8 bit
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.s}[0],  [x0], x2    // row 0
+        st1             {v0.s}[1],  [x0], x2    // row 1
+        st1             {v0.s}[3],  [x0], x2    // row 2 sits in lane 3 (see the trn1 on v24 above)
+        st1             {v0.s}[2],  [x0], x2    // row 3 sits in lane 2
+
+        ret
+endfunc
+
+function ff_vp8_idct_dc_add4y_neon, export=1    // x0: 4 rows of 16 pixels (updated in place), x1: four DC halfwords spaced 32 bytes apart (zeroed here), x2: byte step between pixel rows
+        movi            v0.16b,  #0
+        mov             x3,  #32    // byte distance between consecutive DC values
+        ld1r            {v16.4h},    [x1]    // DC 0 replicated into 4 lanes
+        st1             {v0.h}[0],   [x1], x3    // clear DC 0, advance to the next one
+        ld1r            {v17.4h},    [x1]
+        st1             {v0.h}[0],   [x1], x3
+        zip1            v16.2d,      v16.2d, v17.2d    // v16 = DC0 x4, DC1 x4 (for pixel columns 0..7)
+        ld1r            {v18.4h},    [x1]
+        st1             {v0.h}[0],   [x1], x3
+        ld1r            {v19.4h},    [x1]
+        st1             {v0.h}[0],   [x1], x3
+        zip1            v18.2d,      v18.2d, v19.2d    // v18 = DC2 x4, DC3 x4 (for pixel columns 8..15)
+        srshr           v16.8h,      v16.8h,  #3            // dc = (dc + 4) >> 3
+        ld1             {v0.16b},     [x0], x2
+        srshr           v18.8h,       v18.8h,  #3
+        ld1             {v1.16b},     [x0], x2
+        uaddw           v20.8h,       v16.8h,  v0.8b    // row 0, columns 0..7
+        ld1             {v2.16b},     [x0], x2
+        uaddw2          v0.8h,        v18.8h,   v0.16b    // row 0, columns 8..15
+        ld1             {v3.16b},     [x0], x2
+        uaddw           v21.8h, v16.8h,  v1.8b
+        uaddw2          v1.8h,  v18.8h,  v1.16b
+        uaddw           v22.8h, v16.8h,  v2.8b
+        uaddw2          v2.8h,  v18.8h,  v2.16b
+        uaddw           v23.8h, v16.8h,  v3.8b
+        uaddw2          v3.8h,  v18.8h,  v3.16b
+        sub             x0,  x0,  x2,  lsl #2    // rewind x0 over the four rows just loaded
+        sqxtun          v20.8b,  v20.8h    // saturate to unsigned 8 bit
+        sqxtun2         v20.16b, v0.8h
+        sqxtun          v21.8b,  v21.8h
+        sqxtun2         v21.16b, v1.8h
+        sqxtun          v22.8b,  v22.8h
+        st1             {v20.16b},    [x0], x2
+        sqxtun2         v22.16b, v2.8h
+        st1             {v21.16b},    [x0], x2
+        sqxtun          v23.8b,  v23.8h
+        st1             {v22.16b},    [x0], x2
+        sqxtun2         v23.16b, v3.8h
+        st1             {v23.16b},    [x0], x2
+
+        ret
+endfunc
+
+function ff_vp8_idct_dc_add_neon, export=1    // x0: 4 rows of 4 pixels (updated in place), x1: DC halfword (zeroed here), x2: byte step between pixel rows
+        mov             w3,       #0
+        ld1r            {v2.8h},  [x1]    // DC replicated into all 8 lanes
+        strh            w3,       [x1]    // clear the DC in memory
+        srshr           v2.8h,  v2.8h,  #3    // dc = (dc + 4) >> 3
+        ld1             {v0.s}[0],  [x0], x2    // rows 0 and 1 share v0
+        ld1             {v0.s}[1],  [x0], x2
+        uaddw           v3.8h,  v2.8h,  v0.8b
+        ld1             {v1.s}[0],  [x0], x2    // rows 2 and 3 share v1
+        ld1             {v1.s}[1],  [x0], x2
+        uaddw           v4.8h,  v2.8h,  v1.8b
+        sqxtun          v0.8b,  v3.8h    // saturate to unsigned 8 bit
+        sqxtun          v1.8b,  v4.8h
+        sub             x0,  x0,  x2, lsl #2    // rewind x0 over the four rows just loaded
+        st1             {v0.s}[0],  [x0], x2
+        st1             {v0.s}[1],  [x0], x2
+        st1             {v1.s}[0],  [x0], x2
+        st1             {v1.s}[1],  [x0], x2
+        ret
+endfunc
+
+// Register layout:
+//   P3..Q3 -> v0..v7
+//   flim_E -> v22
+//   flim_I -> v23
+//   hev_thresh -> the hev_thresh macro argument (callers pass a w register); not used when simple=1
+//
+.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh    // filters in place: at most v1..v6 change, v0 and v7 are preserved, v16..v23 are clobbered
+    .if \simple
+        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
+        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
+        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
+        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
+        uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+        movi            v21.16b, #0x80
+        cmhs            v16.16b, v22.16b, v19.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
+    .else
+        // calculate hev and normal_limit:
+        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
+        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
+        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
+        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
+        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
+        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
+        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
+        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
+        and             v16.16b, v17.16b, v16.16b
+        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
+        and             v16.16b, v16.16b, v19.16b
+        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
+        and             v16.16b, v16.16b, v18.16b
+        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
+        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
+        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
+        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)  (flim_I in v23 is no longer needed)
+        and             v16.16b, v16.16b, v18.16b
+        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
+        and             v16.16b, v16.16b, v19.16b
+        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
+        dup             v23.16b, \hev_thresh          // hev_thresh
+        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
+        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
+        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
+        and             v16.16b, v16.16b, v19.16b
+        movi            v21.16b, #0x80
+        orr             v17.16b, v20.16b, v22.16b
+    .endif
+
+        // at this point:
+        //   v16: normal_limit
+        //   v17: hev (not computed when simple=1)
+
+        // convert to signed value:
+        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
+        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
+
+        movi           v20.8h, #3
+        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
+        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
+        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
+        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
+        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
+        mul            v19.8h, v19.8h, v20.8h
+
+        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
+        movi           v22.16b, #4
+        movi           v23.16b, #3
+    .if \inner
+        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
+    .endif
+        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
+        saddw2         v19.8h,  v19.8h, v20.16b
+        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
+        sqxtn2         v18.16b, v19.8h
+    .if !\inner && !\simple
+        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
+        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
+    .endif
+        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit
+
+        // registers used at this point..
+        //   v0 -> P3  (don't corrupt)
+        //   v1-v6 -> PS2-QS2
+        //   v7 -> Q3  (don't corrupt)
+        //   v17 -> hev
+        //   v18 -> w
+        //   v21 -> #0x80
+        //   v22 -> #4
+        //   v23 -> #3
+        //   v16, v19, v29 -> unused
+        //
+        // filter_common:   is4tap==1
+        //   c1 = clamp(w + 4) >> 3;
+        //   c2 = clamp(w + 3) >> 3;
+        //   Q0 = s2u(QS0 - c1);
+        //   P0 = s2u(PS0 + c2);
+
+    .if \simple
+        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp(w+4)
+        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp(w+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80 (restored, value unchanged)
+        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80 (restored, value unchanged)
+    .elseif \inner
+        // the !is4tap case of filter_common, only used for inner blocks
+        //   c3 = ((c1&~hev) + 1) >> 1;
+        //   Q1 = s2u(QS1 - c3);
+        //   P1 = s2u(PS1 + c3);
+        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp(w+4)
+        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp(w+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
+        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        srshr          v19.16b, v19.16b, #1                // c3 = (c3 + 1) >> 1
+        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
+        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
+        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+    .else
+        and            v20.16b, v18.16b, v17.16b           // w & hev
+        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
+        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
+        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+
+        // filter_mbedge:
+        //   a = clamp((27*w + 63) >> 7);
+        //   Q0 = s2u(QS0 - a);
+        //   P0 = s2u(PS0 + a);
+        //   a = clamp((18*w + 63) >> 7);
+        //   Q1 = s2u(QS1 - a);
+        //   P1 = s2u(PS1 + a);
+        //   a = clamp((9*w + 63) >> 7);
+        //   Q2 = s2u(QS2 - a);
+        //   P2 = s2u(PS2 + a);
+        movi           v17.8h,  #63
+        sshll          v22.8h,  v18.8b, #3               // 8*w (widened to 16bit)
+        sshll2         v23.8h,  v18.16b, #3
+        saddw          v22.8h,  v22.8h, v18.8b           // 9*w
+        saddw2         v23.8h,  v23.8h, v18.16b
+        add            v16.8h,  v17.8h, v22.8h
+        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
+        add            v19.8h,  v16.8h, v22.8h
+        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
+        add            v22.8h,  v19.8h, v22.8h
+        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
+        sqshrn         v16.8b,  v16.8h,  #7
+        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
+        sqshrn         v19.8b,  v19.8h, #7
+        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
+        sqshrn         v22.8b,  v22.8h, #7
+        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
+        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
+        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
+        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
+        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
+        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
+        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
+        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
+        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
+        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
+        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
+        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
+        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
+    .endif
+.endm
+
+.macro  vp8_v_loop_filter16 name, inner=0, simple=0    // emits ff_vp8_v_loop_filter16<name>_neon; x0: dst at row Q0, x1: row stride, w2: flim_E, w3: flim_I, w4: hev_thresh
+function ff_vp8_v_loop_filter16\name\()_neon, export=1
+        sub             x0,  x0,  x1,  lsl #1+!\simple    // step up 4 rows (to P3), or 2 rows (to P1) when simple
+
+        // Load pixels:
+    .if !\simple
+        ld1             {v0.16b},     [x0], x1 // P3
+        ld1             {v1.16b},     [x0], x1 // P2
+    .endif
+        ld1             {v2.16b},     [x0], x1 // P1
+        ld1             {v3.16b},     [x0], x1 // P0
+        ld1             {v4.16b},     [x0], x1 // Q0
+        ld1             {v5.16b},     [x0], x1 // Q1
+    .if !\simple
+        ld1             {v6.16b},     [x0], x1 // Q2
+        ld1             {v7.16b},     [x0]     // Q3
+        dup             v23.16b, w3                 // flim_I
+    .endif
+        dup             v22.16b, w2                 // flim_E
+
+        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
+
+        // back up to the first stored row:  dst -= stride * 4 (simple, P1) or stride * 6 (otherwise, P2)
+        sub             x0,  x0,  x1,  lsl #2
+    .if !\simple
+        sub             x0,  x0,  x1,  lsl #1
+
+        // Store pixels:
+        st1             {v1.16b},     [x0], x1 // P2
+    .endif
+        st1             {v2.16b},     [x0], x1 // P1
+        st1             {v3.16b},     [x0], x1 // P0
+        st1             {v4.16b},     [x0], x1 // Q0
+        st1             {v5.16b},     [x0], x1 // Q1
+    .if !\simple
+        st1             {v6.16b},     [x0]     // Q2
+    .endif
+
+        ret
+endfunc
+.endm
+
+vp8_v_loop_filter16
+vp8_v_loop_filter16 _inner,  inner=1
+vp8_v_loop_filter16 _simple, simple=1
+
+.macro  vp8_v_loop_filter8uv name, inner=0    // emits ff_vp8_v_loop_filter8uv<name>_neon; x0: u, x1: v (both at row Q0), x2: row stride, w3: flim_E, w4: flim_I, w5: hev_thresh
+function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
+        sub             x0,  x0,  x2,  lsl #2    // step both pointers up 4 rows, to P3
+        sub             x1,  x1,  x2,  lsl #2
+        // Load pixels: 8 bytes from x0 into the low half, 8 bytes from x1 into the high half
+        ld1          {v0.d}[0],     [x0], x2  // P3
+        ld1          {v0.d}[1],     [x1], x2  // P3
+        ld1          {v1.d}[0],     [x0], x2  // P2
+        ld1          {v1.d}[1],     [x1], x2  // P2
+        ld1          {v2.d}[0],     [x0], x2  // P1
+        ld1          {v2.d}[1],     [x1], x2  // P1
+        ld1          {v3.d}[0],     [x0], x2  // P0
+        ld1          {v3.d}[1],     [x1], x2  // P0
+        ld1          {v4.d}[0],     [x0], x2  // Q0
+        ld1          {v4.d}[1],     [x1], x2  // Q0
+        ld1          {v5.d}[0],     [x0], x2  // Q1
+        ld1          {v5.d}[1],     [x1], x2  // Q1
+        ld1          {v6.d}[0],     [x0], x2  // Q2
+        ld1          {v6.d}[1],     [x1], x2  // Q2
+        ld1          {v7.d}[0],     [x0]      // Q3
+        ld1          {v7.d}[1],     [x1]      // Q3
+
+        dup          v22.16b, w3                 // flim_E
+        dup          v23.16b, w4                 // flim_I
+
+        vp8_loop_filter inner=\inner, hev_thresh=w5
+
+        // back up to P2:  u,v -= stride * 6
+        sub          x0,  x0,  x2,  lsl #2
+        sub          x1,  x1,  x2,  lsl #2
+        sub          x0,  x0,  x2,  lsl #1
+        sub          x1,  x1,  x2,  lsl #1
+
+        // Store pixels:
+
+        st1          {v1.d}[0],     [x0], x2  // P2
+        st1          {v1.d}[1],     [x1], x2  // P2
+        st1          {v2.d}[0],     [x0], x2  // P1
+        st1          {v2.d}[1],     [x1], x2  // P1
+        st1          {v3.d}[0],     [x0], x2  // P0
+        st1          {v3.d}[1],     [x1], x2  // P0
+        st1          {v4.d}[0],     [x0], x2  // Q0
+        st1          {v4.d}[1],     [x1], x2  // Q0
+        st1          {v5.d}[0],     [x0], x2  // Q1
+        st1          {v5.d}[1],     [x1], x2  // Q1
+        st1          {v6.d}[0],     [x0]      // Q2
+        st1          {v6.d}[1],     [x1]      // Q2
+
+        ret
+endfunc
+.endm
+
+vp8_v_loop_filter8uv
+vp8_v_loop_filter8uv _inner, inner=1
+
+.macro  vp8_h_loop_filter16 name, inner=0, simple=0    // emits ff_vp8_h_loop_filter16<name>_neon; x0: dst at column Q0, x1: row stride, w2: flim_E, w3: flim_I, w4: hev_thresh
+function ff_vp8_h_loop_filter16\name\()_neon, export=1
+
+        sub             x0,  x0,  #4    // start 4 bytes left of x0 so each 8-byte row covers P3..Q3
+        // Load pixels: rows 0..7 go to the low halves, rows 8..15 to the high halves
+        ld1             {v0.d}[0], [x0], x1
+        ld1             {v1.d}[0], [x0], x1
+        ld1             {v2.d}[0], [x0], x1
+        ld1             {v3.d}[0], [x0], x1
+        ld1             {v4.d}[0], [x0], x1
+        ld1             {v5.d}[0], [x0], x1
+        ld1             {v6.d}[0], [x0], x1
+        ld1             {v7.d}[0], [x0], x1
+        ld1             {v0.d}[1], [x0], x1
+        ld1             {v1.d}[1], [x0], x1
+        ld1             {v2.d}[1], [x0], x1
+        ld1             {v3.d}[1], [x0], x1
+        ld1             {v4.d}[1], [x0], x1
+        ld1             {v5.d}[1], [x0], x1
+        ld1             {v6.d}[1], [x0], x1
+        ld1             {v7.d}[1], [x0], x1
+
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+
+        dup             v22.16b, w2                 // flim_E
+    .if !\simple
+        dup             v23.16b, w3                 // flim_I
+    .endif
+
+        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
+
+        sub             x0,  x0,  x1, lsl #4    // backup 16 rows
+
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+
+        // Store pixels: all 8 bytes of every row are written back, in the same order as the loads
+        st1             {v0.d}[0], [x0], x1
+        st1             {v1.d}[0], [x0], x1
+        st1             {v2.d}[0], [x0], x1
+        st1             {v3.d}[0], [x0], x1
+        st1             {v4.d}[0], [x0], x1
+        st1             {v5.d}[0], [x0], x1
+        st1             {v6.d}[0], [x0], x1
+        st1             {v7.d}[0], [x0], x1
+        st1             {v0.d}[1], [x0], x1
+        st1             {v1.d}[1], [x0], x1
+        st1             {v2.d}[1], [x0], x1
+        st1             {v3.d}[1], [x0], x1
+        st1             {v4.d}[1], [x0], x1
+        st1             {v5.d}[1], [x0], x1
+        st1             {v6.d}[1], [x0], x1
+        st1             {v7.d}[1], [x0]
+
+        ret
+endfunc
+.endm
+
+vp8_h_loop_filter16
+vp8_h_loop_filter16 _inner,  inner=1
+vp8_h_loop_filter16 _simple, simple=1
+
+.macro  vp8_h_loop_filter8uv name, inner=0    // emits ff_vp8_h_loop_filter8uv<name>_neon; x0: u, x1: v (both at column Q0), x2: row stride, w3: flim_E, w4: flim_I, w5: hev_thresh
+function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
+        sub             x0,  x0,  #4    // start 4 bytes left so each 8-byte row covers P3..Q3
+        sub             x1,  x1,  #4
+
+        // Load pixels:
+        ld1          {v0.d}[0],     [x0], x2 // load u
+        ld1          {v0.d}[1],     [x1], x2 // load v
+        ld1          {v1.d}[0],     [x0], x2
+        ld1          {v1.d}[1],     [x1], x2
+        ld1          {v2.d}[0],     [x0], x2
+        ld1          {v2.d}[1],     [x1], x2
+        ld1          {v3.d}[0],     [x0], x2
+        ld1          {v3.d}[1],     [x1], x2
+        ld1          {v4.d}[0],     [x0], x2
+        ld1          {v4.d}[1],     [x1], x2
+        ld1          {v5.d}[0],     [x0], x2
+        ld1          {v5.d}[1],     [x1], x2
+        ld1          {v6.d}[0],     [x0], x2
+        ld1          {v6.d}[1],     [x1], x2
+        ld1          {v7.d}[0],     [x0], x2
+        ld1          {v7.d}[1],     [x1], x2
+
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+
+        dup             v22.16b, w3                 // flim_E
+        dup             v23.16b, w4                 // flim_I
+
+        vp8_loop_filter inner=\inner, hev_thresh=w5
+
+        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
+        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows
+
+        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+
+        // Store pixels:
+        st1          {v0.d}[0],     [x0], x2 // store u
+        st1          {v0.d}[1],     [x1], x2 // store v
+        st1          {v1.d}[0],     [x0], x2
+        st1          {v1.d}[1],     [x1], x2
+        st1          {v2.d}[0],     [x0], x2
+        st1          {v2.d}[1],     [x1], x2
+        st1          {v3.d}[0],     [x0], x2
+        st1          {v3.d}[1],     [x1], x2
+        st1          {v4.d}[0],     [x0], x2
+        st1          {v4.d}[1],     [x1], x2
+        st1          {v5.d}[0],     [x0], x2
+        st1          {v5.d}[1],     [x1], x2
+        st1          {v6.d}[0],     [x0], x2
+        st1          {v6.d}[1],     [x1], x2
+        st1          {v7.d}[0],     [x0]
+        st1          {v7.d}[1],     [x1]
+
+        ret
+
+endfunc
+.endm
+
+vp8_h_loop_filter8uv
+vp8_h_loop_filter8uv _inner, inner=1
+
+
+function ff_put_vp8_pixels16_neon, export=1    // copy 16-byte rows; x0: dst, x1: dst stride, x2: src, x3: src stride, w4: row count
+1:
+        subs            w4, w4, #4    // four rows are copied per iteration
+        ld1             {v0.16b},     [x2], x3
+        ld1             {v1.16b},     [x2], x3
+        ld1             {v2.16b},     [x2], x3
+        ld1             {v3.16b},     [x2], x3
+        st1             {v0.16b},     [x0], x1
+        st1             {v1.16b},     [x0], x1
+        st1             {v2.16b},     [x0], x1
+        st1             {v3.16b},     [x0], x1
+        bgt             1b    // repeat while the remaining count is positive
+        ret
+endfunc
+
+function ff_put_vp8_pixels8_neon, export=1    // copy 8-byte rows; x0: dst, x1: dst stride, x2: src, x3: src stride, w4: row count
+1:
+        subs            w4, w4, #4    // four rows are copied per iteration
+        ld1             {v0.8b},   [x2], x3    // two rows share one register: low half ...
+        ld1             {v0.d}[1], [x2], x3    // ... and high half
+        ld1             {v1.8b},   [x2], x3
+        ld1             {v1.d}[1], [x2], x3
+        st1             {v0.8b},   [x0], x1
+        st1             {v0.d}[1], [x0], x1
+        st1             {v1.8b},   [x0], x1
+        st1             {v1.d}[1], [x0], x1
+        bgt             1b    // repeat while the remaining count is positive
+        ret
+endfunc
+
+/* 4/6-tap 8th-pel MC */
+
+.macro  vp8_epel8_h6    d,   s0,   s1    // d.8b = horizontal 6-tap filter over the 16 source bytes s0:s1; taps in v0.h[0..5]; clobbers v18, v19, v21..v26
+        ext             v22.8b, \s0\().8b,  \s1\().8b,  #1
+        uxtl            v18.8h, \s0\().8b
+        ext             v23.8b, \s0\().8b,  \s1\().8b,  #2
+        uxtl            v19.8h, v22.8b
+        ext             v24.8b, \s0\().8b,  \s1\().8b,  #3
+        uxtl            v21.8h, v23.8b
+        ext             v25.8b, \s0\().8b,  \s1\().8b,  #4
+        uxtl            v22.8h, v24.8b
+        ext             v26.8b, \s0\().8b,  \s1\().8b,  #5
+        uxtl            v25.8h, v25.8b
+        mul             v21.8h, v21.8h, v0.h[2]
+        uxtl            v26.8h, v26.8b
+        mul             v22.8h, v22.8h, v0.h[3]
+        mls             v21.8h, v19.8h, v0.h[1]    // taps 1 and 4 are subtracted, the others added
+        mls             v22.8h, v25.8h, v0.h[4]
+        mla             v21.8h, v18.8h, v0.h[0]
+        mla             v22.8h, v26.8h, v0.h[5]
+        sqadd           v22.8h, v21.8h, v22.8h    // saturating add of the two partial sums
+        sqrshrun        \d\().8b, v22.8h, #7    // (sum + 64) >> 7, saturated to unsigned 8 bit
+.endm
+
+.macro  vp8_epel16_h6   d0,  v0,  v1    // d0.16b = horizontal 6-tap filter over the 32 source bytes (first arg : second arg); taps in register v0.h[0..5]; clobbers v1..v3, v16..v23
+        ext             v22.16b, \v0\().16b, \v1\().16b, #3
+        ext             v23.16b, \v0\().16b, \v1\().16b, #4
+        uxtl            v19.8h,  v22.8b
+        uxtl2           v22.8h,  v22.16b
+        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
+        uxtl            v20.8h,  v23.8b
+        uxtl2           v23.8h,  v23.16b
+        ext             v16.16b, \v0\().16b, \v1\().16b, #1
+        uxtl            v18.8h,  v3.8b
+        uxtl2           v3.8h,   v3.16b
+        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
+        uxtl            v21.8h,  v2.8b
+        uxtl2           v2.8h,   v2.16b
+        uxtl            v17.8h,  v16.8b
+        uxtl2           v16.8h,  v16.16b
+        mul             v19.8h,  v19.8h, v0.h[3]
+        mul             v18.8h,  v18.8h, v0.h[2]
+        mul             v3.8h,   v3.8h,  v0.h[2]
+        mul             v22.8h,  v22.8h, v0.h[3]
+        mls             v19.8h,  v20.8h, v0.h[4]    // taps 1 and 4 are subtracted, the others added
+        uxtl            v20.8h,  \v0\().8b
+        uxtl2           v1.8h,   \v0\().16b
+        mls             v18.8h,  v17.8h, v0.h[1]
+        mls             v3.8h,   v16.8h, v0.h[1]
+        mls             v22.8h,  v23.8h, v0.h[4]
+        mla             v18.8h,  v20.8h, v0.h[0]
+        mla             v19.8h,  v21.8h, v0.h[5]
+        mla             v3.8h,   v1.8h,  v0.h[0]
+        mla             v22.8h,  v2.8h,  v0.h[5]
+        sqadd           v19.8h,  v18.8h, v19.8h    // low 8 outputs
+        sqadd           v22.8h,  v3.8h,  v22.8h    // high 8 outputs
+        sqrshrun        \d0\().8b,  v19.8h, #7    // (sum + 64) >> 7, saturated to unsigned 8 bit
+        sqrshrun2       \d0\().16b, v22.8h, #7
+.endm
+
+.macro  vp8_epel8_v6    d0,  s0,  s1,  s2, s3, s4, s5    // d0.8b = 6-tap filter across the rows s0..s5; taps in v0.h[0..5]; s0..s5 are widened in place and clobbered
+        uxtl            \s2\().8h, \s2\().8b
+        uxtl            \s3\().8h, \s3\().8b
+        uxtl            \s1\().8h, \s1\().8b
+        uxtl            \s4\().8h, \s4\().8b
+        uxtl            \s0\().8h, \s0\().8b
+        uxtl            \s5\().8h, \s5\().8b
+        mul             \s2\().8h, \s2\().8h, v0.h[2]
+        mul             \s3\().8h, \s3\().8h, v0.h[3]
+        mls             \s2\().8h, \s1\().8h, v0.h[1]    // taps 1 and 4 are subtracted, the others added
+        mls             \s3\().8h, \s4\().8h, v0.h[4]
+        mla             \s2\().8h, \s0\().8h, v0.h[0]
+        mla             \s3\().8h, \s5\().8h, v0.h[5]
+        sqadd           \s3\().8h, \s2\().8h, \s3\().8h    // saturating add of the two partial sums
+        sqrshrun        \d0\().8b, \s3\().8h, #7    // (sum + 64) >> 7, saturated to unsigned 8 bit
+.endm
+
+.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6    // two output rows: d0 from rows s0..s5, d1 from rows s1..s6; taps in v0.h[0..5]; clobbers s0..s6 and v31
+        uxtl            \s0\().8h, \s0\().8b
+        uxtl            \s3\().8h, \s3\().8b
+        uxtl            \s6\().8h, \s6\().8b
+        uxtl            \s1\().8h, \s1\().8b
+        uxtl            \s4\().8h, \s4\().8b
+        uxtl            \s2\().8h, \s2\().8b
+        uxtl            \s5\().8h, \s5\().8b
+        mul             \s0\().8h, \s0\().8h, v0.h[0]    // d0, taps 0..2 accumulate in s0
+        mul             v31.8h   , \s3\().8h, v0.h[3]    // d0, taps 3..5 accumulate in v31
+        mul             \s3\().8h, \s3\().8h, v0.h[2]    // d1, taps 0..2 accumulate in s3
+        mul             \s6\().8h, \s6\().8h, v0.h[5]    // d1, taps 3..5 accumulate in s6
+
+        mls             \s0\().8h, \s1\().8h, v0.h[1]
+        mls             v31.8h   , \s4\().8h, v0.h[4]
+        mls             \s3\().8h, \s2\().8h, v0.h[1]
+        mls             \s6\().8h, \s5\().8h, v0.h[4]
+
+        mla             \s0\().8h, \s2\().8h, v0.h[2]
+        mla             v31.8h   , \s5\().8h, v0.h[5]
+        mla             \s3\().8h, \s1\().8h, v0.h[0]
+        mla             \s6\().8h, \s4\().8h, v0.h[3]
+        sqadd           v31.8h   , \s0\().8h, v31.8h
+        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
+        sqrshrun        \d0\().8b, v31.8h,    #7    // (sum + 64) >> 7, saturated to unsigned 8 bit
+        sqrshrun        \d1\().8b, \s6\().8h, #7
+.endm
+
+.macro  vp8_epel8_h4    d,   v0,   v1    // d.8b = horizontal 4-tap filter over the 16 source bytes (first arg : second arg); taps in register v0.h[1..4]; clobbers v19, v20, v22, v23, v25
+        ext             v22.8b, \v0\().8b,  \v1\().8b,  #1
+        uxtl            v19.8h, \v0\().8b
+        ext             v23.8b, \v0\().8b,  \v1\().8b,  #2
+        uxtl            v20.8h, v22.8b
+        ext             v25.8b, \v0\().8b,  \v1\().8b,  #3
+        uxtl            v22.8h, v23.8b
+        uxtl            v25.8h, v25.8b
+        mul             v20.8h, v20.8h, v0.h[2]
+        mul             v22.8h, v22.8h, v0.h[3]
+        mls             v20.8h, v19.8h, v0.h[1]    // outer taps (1 and 4) are subtracted
+        mls             v22.8h, v25.8h, v0.h[4]
+        sqadd           v22.8h, v20.8h, v22.8h    // saturating add of the two partial sums
+        sqrshrun        \d\().8b, v22.8h, #7    // (sum + 64) >> 7, saturated to unsigned 8 bit
+.endm
+
+.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4    // two 4-tap output rows packed in d0: low 8 bytes from rows s0..s3, high 8 bytes from rows s1..s4; taps in v0.h[1..4]; clobbers s0..s4, v21..v23
+        uxtl            \s0\().8h,  \s0\().8b
+        uxtl            \s1\().8h,  \s1\().8b
+        uxtl            \s2\().8h,  \s2\().8b
+        uxtl            \s3\().8h,  \s3\().8b
+        uxtl            \s4\().8h,  \s4\().8b
+        mul             v21.8h,     \s1\().8h, v0.h[2]    // first row: partial sums in v21 and v23
+        mul             v23.8h,     \s2\().8h, v0.h[3]
+        mul             \s2\().8h,  \s2\().8h, v0.h[2]    // second row: partial sums in s2 and v22
+        mul             v22.8h,     \s3\().8h, v0.h[3]
+        mls             v21.8h,     \s0\().8h, v0.h[1]
+        mls             v23.8h,     \s3\().8h, v0.h[4]
+        mls             \s2\().8h,  \s1\().8h, v0.h[1]
+        mls             v22.8h,     \s4\().8h, v0.h[4]
+        sqadd           v21.8h,     v21.8h,    v23.8h
+        sqadd           \s2\().8h,  \s2\().8h, v22.8h
+        sqrshrun        \d0\().8b,  v21.8h,    #7    // (sum + 64) >> 7, saturated to unsigned 8 bit
+        sqrshrun2       \d0\().16b, \s2\().8h, #7
+.endm
+
+
+// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
+// arithmetic can be used to apply filters
+const   subpel_filters, align=4                          // seven filters, one 16-byte row of eight 16-bit values each; users load a row at base - 16 + (index << 4), so index 1 selects the first row
+        .short     0,   6, 123,  12,   1,   0,   0,   0   // entries are magnitudes: the filter macros subtract the h[1] and h[4] products (mls) and add the rest
+        .short     2,  11, 108,  36,   8,   1,   0,   0   // the visible macros only read h[0]..h[5]; the last two entries appear to be padding
+        .short     0,   9,  93,  50,   6,   0,   0,   0
+        .short     3,  16,  77,  77,  16,   3,   0,   0
+        .short     0,   6,  50,  93,   9,   0,   0,   0
+        .short     1,   8,  36, 108,  11,   2,   0,   0
+        .short     0,   1,  12, 123,   6,   0,   0,   0
+endconst
+
+function ff_put_vp8_epel16_v6_neon, export=1             // 6-tap vertical filter, 16 bytes wide, two output rows per iteration. As used below: x0/x1 = store pointer and per-row increment, x2/x3 = load pointer and per-row increment, w4 = row count, w6 = filter index
+        sub             x2,  x2,  x3,  lsl #1            // start reading two rows above the given source row
+
+        sxtw            x4,  w4                          // sign-extend the row count to 64 bit
+        sxtw            x6,  w6                          // sign-extend the filter index to 64 bit
+        movrel          x17,  subpel_filters, -16        // table base biased by one 16-byte row, so index 1 selects the first row
+        add             x6,  x17,  x6, lsl #4  // y: address of the selected filter row
+        ld1             {v0.8h},     [x6]                // v0 = filter taps h[0..7]
+1:                                                       // loop: seven input rows -> two output rows
+        ld1             {v1.1d - v2.1d},    [x2], x3     // each load fetches one 16-byte row: left 8 bytes in the first register, right 8 in the second
+        ld1             {v3.1d - v4.1d},    [x2], x3
+        ld1             {v16.1d - v17.1d},  [x2], x3
+        ld1             {v18.1d - v19.1d},  [x2], x3
+        ld1             {v20.1d - v21.1d},  [x2], x3
+        ld1             {v22.1d - v23.1d},  [x2], x3
+        ld1             {v24.1d - v25.1d},  [x2]         // seventh row, no post-increment
+        sub             x2,  x2,  x3, lsl #2             // advanced 6 rows above, rewind 4: net step is 2 rows
+
+        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24   // left halves: results written to v1 and v3 (macro defined earlier in the file)
+        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25   // right halves: results written to v2 and v4
+
+        st1             {v1.1d - v2.1d}, [x0], x1        // first output row (left + right half)
+        st1             {v3.1d - v4.1d}, [x0], x1        // second output row
+        subs            x4, x4, #2                       // two rows done; loop ends only when the count reaches exactly zero, so the count must be even
+        bne             1b
+
+        ret
+endfunc
+
+function ff_put_vp8_epel16_h6_neon, export=1             // 6-tap horizontal filter, 16 bytes wide, one row per iteration. As used below: x0/x1 = store pointer and per-row increment, x2/x3 = load pointer and per-row increment, w4 = row count, w5 = filter index
+        sub             x2,  x2,  #2                     // start reading two bytes to the left of the given source position
+        sxtw            x5,  w5 // x: sign-extend the filter index
+
+        // first pass (horizontal):
+        movrel          x17,  subpel_filters, -16        // table base biased by one 16-byte row, so index 1 selects the first row
+        add             x5,  x17,  x5, lsl #4 // x: address of the selected filter row
+        ld1             {v0.8h},  [x5]                   // v0 = filter taps
+1:                                                       // loop: one row per iteration
+        ld1             {v1.16b, v2.16b}, [x2], x3       // load 32 consecutive source bytes, then step to the next row
+        vp8_epel16_h6   v1, v1, v2                       // filtered 16 bytes returned in v1 (macro defined earlier in the file)
+        st1             {v1.16b}, [x0], x1               // store one output row
+
+        subs            w4, w4, #1                       // 32-bit counter, so no sign extension of w4 is needed here
+        bne             1b
+        ret
+endfunc
+
+
+function ff_put_vp8_epel16_h6v6_neon, export=1           // 6-tap horizontal then 6-tap vertical filter, 16 bytes wide, via an aligned temporary buffer on the stack. As used below: x0/x1 = store pointer and per-row increment, x2/x3 = load pointer and per-row increment, w4 = row count, w5/w6 = horizontal/vertical filter index
+        sub             x2,  x2,  x3,  lsl #1            // start two rows above ...
+        sub             x2,  x2,  #2                     // ... and two bytes to the left of the given source position
+
+        // first pass (horizontal):
+        movrel          x17,  subpel_filters, -16        // biased table base; x17 is kept for the second pass
+        sxtw            x5,  w5 // x: sign-extend the horizontal filter index
+        add             x16,  x17,  x5, lsl #4 // x: address of the horizontal filter row
+        sub             sp,  sp,  #336+16                // 336 bytes = 21 rows of 16 bytes (h + 5 rows if h is at most 16 - assumed maximum, confirm against callers), plus 16 bytes of alignment slack
+        ld1             {v0.8h},  [x16]                  // v0 = horizontal taps
+        add             x7,  sp,  #15                    // x7 = sp rounded up to a 16-byte boundary (completed by the bic below)
+        sxtw            x4,  w4                          // sign-extend the row count
+        add             x16, x4, #5   // h: the first pass filters h + 5 rows, the extra rows feeding the vertical taps
+        bic             x7,  x7,  #15
+1:                                                       // first-pass loop: one row per iteration
+        ld1             {v1.16b, v2.16b}, [x2], x3       // load 32 consecutive source bytes
+        vp8_epel16_h6   v1, v1, v2                       // filtered 16 bytes returned in v1 (macro defined earlier in the file)
+        st1             {v1.16b}, [x7], #16              // append the row to the temporary buffer
+        subs            x16, x16, #1
+        bne             1b
+
+
+        // second pass (vertical):
+        sxtw            x6,  w6                          // sign-extend the vertical filter index
+        add             x6,  x17,  x6, lsl #4  // y: address of the vertical filter row
+        add             x7,  sp,  #15                    // recompute the aligned start of the temporary buffer
+        ld1             {v0.8h},     [x6]                // v0 = vertical taps (replaces the horizontal ones)
+        bic             x7,  x7,  #15
+2:                                                       // second-pass loop: six buffered rows -> one output row
+        ld1             {v1.8b - v4.8b},    [x7], #32    // rows 0-1: v1/v2 = left/right half of row 0, v3/v4 of row 1
+        ld1             {v16.8b - v19.8b},  [x7], #32    // rows 2-3
+        ld1             {v20.8b - v23.8b},  [x7]         // rows 4-5
+        sub             x7,  x7,  #48                    // advanced 64 above, rewind 48: net step is one 16-byte row
+
+        vp8_epel8_v6    v5, v1, v3, v16, v18, v20, v22   // left halves -> v5 (macro defined earlier in the file)
+        vp8_epel8_v6    v2, v2, v4, v17, v19, v21, v23   // right halves -> v2
+        trn1            v2.2d, v5.2d, v2.2d              // v2 = { low 64 bits of v5, low 64 bits of v2 }: one full 16-byte row
+
+        st1             {v2.16b}, [x0], x1               // store one output row
+        subs            x4, x4, #1
+        bne             2b
+
+        add             sp,  sp,  #336+16                // release the temporary buffer
+        ret
+endfunc
+
+function ff_put_vp8_epel8_h6v6_neon, export=1            // 6-tap horizontal then 6-tap vertical filter, 8 bytes wide, via an aligned temporary buffer on the stack. As used below: x0/x1 = store pointer and per-row increment, x2/x3 = load pointer and per-row increment, w4 = row count, w5/w6 = horizontal/vertical filter index
+        sub             x2,  x2,  x3,  lsl #1            // start two rows above ...
+        sub             x2,  x2,  #2                     // ... and two bytes to the left of the given source position
+        sxtw            x4,  w4                          // sign-extend the row count
+
+        // first pass (horizontal):
+        movrel          x17,  subpel_filters, -16        // biased table base; x17 is kept for the second pass
+        sxtw            x5,  w5                          // sign-extend the horizontal filter index
+        add             x5,  x17,  x5, lsl #4 // x: address of the horizontal filter row
+        sub             sp,  sp,  #168+16                // 168 bytes = 21 rows of 8 bytes (h + 5 rows if h is at most 16 - assumed maximum, confirm against callers), plus 16 bytes of alignment slack
+        ld1             {v0.8h},  [x5]                   // v0 = horizontal taps
+        add             x7,  sp,  #15                    // x7 = sp rounded up to a 16-byte boundary (completed by the bic below)
+        add             x16, x4,  #5   // h: the first pass filters h + 5 rows, the extra rows feeding the vertical taps
+        bic             x7,  x7,  #15
+1:                                                       // first-pass loop: one row per iteration
+        ld1             {v1.8b, v2.8b}, [x2], x3         // load 16 consecutive source bytes
+
+        vp8_epel8_h6    v1, v1, v2                       // filtered 8 bytes returned in v1 (macro defined earlier in the file)
+
+        st1             {v1.8b}, [x7], #8                // append the row to the temporary buffer
+        subs            x16, x16, #1
+        bne             1b
+
+        // second pass (vertical):
+        sxtw            x6,  w6                          // sign-extend the vertical filter index
+        add             x6,  x17,  x6, lsl #4  // y: address of the vertical filter row
+        add             x7,  sp,   #15                   // recompute the aligned start of the temporary buffer
+        ld1             {v0.8h},   [x6]                  // v0 = vertical taps (replaces the horizontal ones)
+        bic             x7,  x7,   #15
+2:                                                       // second-pass loop: seven buffered rows -> two output rows
+        ld1             {v1.8b - v4.8b}, [x7], #32       // buffered rows 0-3
+        ld1             {v5.8b - v7.8b}, [x7]            // buffered rows 4-6
+
+        sub             x7,  x7,  #16                    // advanced 32 above, rewind 16: net step is two 8-byte rows
+
+        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7   // two filtered rows returned in v1 and v2
+
+        st1             {v1.8b}, [x0], x1                // first output row
+        st1             {v2.8b}, [x0], x1                // second output row
+        subs            x4, x4, #2                       // loop ends only when the count reaches exactly zero, so the count must be even
+        bne             2b
+
+        add             sp,  sp,  #168+16                // release the temporary buffer
+        ret
+endfunc
+
+function ff_put_vp8_epel8_h4v6_neon, export=1            // 4-tap horizontal then 6-tap vertical filter, 8 bytes wide, via an aligned temporary buffer on the stack. As used below: x0/x1 = store pointer and per-row increment, x2/x3 = load pointer and per-row increment, w4 = row count, w5/w6 = horizontal/vertical filter index
+        sub             x2,  x2,  x3,  lsl #1            // start two rows above (6-tap vertical) ...
+        sub             x2,  x2,  #1                     // ... and one byte to the left (4-tap horizontal) of the given source position
+        sxtw            x4,  w4                          // sign-extend the row count
+
+        // first pass (horizontal):
+        movrel          x17,  subpel_filters, -16        // biased table base; x17 is kept for the second pass
+        sxtw            x5,  w5                          // sign-extend the horizontal filter index
+        add             x5,  x17,  x5, lsl #4 // x: address of the horizontal filter row
+        sub             sp,  sp,  #168+16                // 168 bytes = 21 rows of 8 bytes (h + 5 rows if h is at most 16 - assumed maximum, confirm against callers), plus 16 bytes of alignment slack
+        ld1             {v0.8h},  [x5]                   // v0 = horizontal taps
+        add             x7,  sp,  #15                    // x7 = sp rounded up to a 16-byte boundary (completed by the bic below)
+        add             x16, x4, #5   // h: the first pass filters h + 5 rows, the extra rows feeding the vertical taps
+        bic             x7,  x7,  #15
+1:                                                       // first-pass loop: one row per iteration
+        ld1             {v1.8b, v2.8b}, [x2], x3         // load 16 consecutive source bytes
+
+        vp8_epel8_h4    v1, v1, v2                       // filtered 8 bytes returned in v1
+
+        st1             {v1.8b}, [x7], #8                // append the row to the temporary buffer
+        subs            x16, x16, #1
+        bne             1b
+
+        // second pass (vertical):
+        sxtw            x6,  w6                          // sign-extend the vertical filter index
+        add             x6,  x17,  x6, lsl #4  // y: address of the vertical filter row
+        add             x7,  sp,   #15                   // recompute the aligned start of the temporary buffer
+        ld1             {v0.8h},   [x6]                  // v0 = vertical taps (replaces the horizontal ones)
+        bic             x7,  x7,   #15
+2:                                                       // second-pass loop: seven buffered rows -> two output rows
+        ld1             {v1.8b - v4.8b}, [x7], #32       // buffered rows 0-3
+        ld1             {v5.8b - v7.8b}, [x7]            // buffered rows 4-6
+
+        sub             x7,  x7,  #16                    // advanced 32 above, rewind 16: net step is two 8-byte rows
+
+        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7   // two filtered rows returned in v1 and v2 (macro defined earlier in the file)
+
+        st1             {v1.8b}, [x0], x1                // first output row
+        st1             {v2.8b}, [x0], x1                // second output row
+        subs            x4, x4, #2                       // loop ends only when the count reaches exactly zero, so the count must be even
+        bne             2b
+
+        add             sp,  sp,  #168+16                // release the temporary buffer
+        ret
+endfunc
+
+function ff_put_vp8_epel8_h4v4_neon, export=1            // 4-tap horizontal then 4-tap vertical filter, 8 bytes wide, via an aligned temporary buffer on the stack. As used below: x0/x1 = store pointer and per-row increment, x2/x3 = load pointer and per-row increment, w4 = row count, w5/w6 = horizontal/vertical filter index
+        sub             x2,  x2,  x3                     // start one row above ...
+        sub             x2,  x2,  #1                     // ... and one byte to the left of the given source position
+        sxtw            x4,  w4                          // sign-extend the row count
+
+
+        // first pass (horizontal):
+        movrel          x17,  subpel_filters, -16        // biased table base; x17 is kept for the second pass
+        sxtw            x5,  w5                          // sign-extend the horizontal filter index
+        add             x5,  x17,  x5, lsl #4 // x: address of the horizontal filter row
+        sub             sp,  sp,  #168+16                // same reservation as the 6-tap vertical variants, although only h + 3 rows of 8 bytes are written here
+        ld1             {v0.8h},  [x5]                   // v0 = horizontal taps
+        add             x7,  sp,  #15                    // x7 = sp rounded up to a 16-byte boundary (completed by the bic below)
+        add             x16, x4, #3   // h: the first pass filters h + 3 rows, the extra rows feeding the vertical taps
+        bic             x7,  x7,  #15
+1:                                                       // first-pass loop: one row per iteration
+        ld1             {v1.8b, v2.8b}, [x2], x3         // load 16 consecutive source bytes
+
+        vp8_epel8_h4    v1, v1, v2                       // filtered 8 bytes returned in v1
+
+        st1             {v1.8b}, [x7], #8                // append the row to the temporary buffer
+        subs            x16, x16, #1
+        bne             1b
+
+        // second pass (vertical):
+        sxtw            x6,  w6                          // sign-extend the vertical filter index
+        add             x6,  x17,  x6, lsl #4  // y: address of the vertical filter row
+        add             x7,  sp,   #15                   // recompute the aligned start of the temporary buffer
+        ld1             {v0.8h},   [x6]                  // v0 = vertical taps (replaces the horizontal ones)
+        bic             x7,  x7,   #15
+2:                                                       // second-pass loop: five buffered rows -> two output rows
+        ld1             {v1.8b - v2.8b}, [x7], #16       // buffered rows 0-1; the post-increment of 16 is the whole two-row step
+        ld1             {v3.8b - v5.8b}, [x7]            // buffered rows 2-4, no pointer update
+
+        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5           // both filtered rows packed into v1 (low half, high half)
+
+        st1             {v1.d}[0], [x0], x1              // first output row
+        st1             {v1.d}[1], [x0], x1              // second output row
+        subs            x4, x4, #2                       // loop ends only when the count reaches exactly zero, so the count must be even
+        bne             2b
+
+        add             sp,  sp,  #168+16                // release the temporary buffer
+        ret
+endfunc
+
+function ff_put_vp8_epel8_h6v4_neon, export=1            // 6-tap horizontal then 4-tap vertical filter, 8 bytes wide, via an aligned temporary buffer on the stack. As used below: x0/x1 = store pointer and per-row increment, x2/x3 = load pointer and per-row increment, w4 = row count, w5/w6 = horizontal/vertical filter index
+        sub             x2,  x2,  x3                     // start one row above (4-tap vertical) ...
+        sub             x2,  x2,  #2                     // ... and two bytes to the left (6-tap horizontal) of the given source position
+        sxtw            x4,  w4                          // sign-extend the row count
+
+
+        // first pass (horizontal):
+        movrel          x17,  subpel_filters, -16        // biased table base; x17 is kept for the second pass
+        sxtw            x5,  w5                          // sign-extend the horizontal filter index
+        add             x5,  x17,  x5, lsl #4 // x: address of the horizontal filter row
+        sub             sp,  sp,  #168+16                // same reservation as the 6-tap vertical variants, although only h + 3 rows of 8 bytes are written here
+        ld1             {v0.8h},  [x5]                   // v0 = horizontal taps
+        add             x7,  sp,  #15                    // x7 = sp rounded up to a 16-byte boundary (completed by the bic below)
+        add             x16, x4, #3   // h: the first pass filters h + 3 rows, the extra rows feeding the vertical taps
+        bic             x7,  x7,  #15
+1:                                                       // first-pass loop: one row per iteration
+        ld1             {v1.8b, v2.8b}, [x2], x3         // load 16 consecutive source bytes
+
+        vp8_epel8_h6    v1, v1, v2                       // filtered 8 bytes returned in v1 (macro defined earlier in the file)
+
+        st1             {v1.8b}, [x7], #8                // append the row to the temporary buffer
+        subs            x16, x16, #1
+        bne             1b
+
+        // second pass (vertical):
+        sxtw            x6,  w6                          // sign-extend the vertical filter index
+        add             x6,  x17,  x6, lsl #4  // y: address of the vertical filter row
+        add             x7,  sp,   #15                   // recompute the aligned start of the temporary buffer
+        ld1             {v0.8h},   [x6]                  // v0 = vertical taps (replaces the horizontal ones)
+        bic             x7,  x7,   #15
+2:                                                       // second-pass loop: five buffered rows -> two output rows
+        ld1             {v1.8b - v2.8b}, [x7], #16       // buffered rows 0-1; the post-increment of 16 is the whole two-row step
+        ld1             {v3.8b - v5.8b}, [x7]            // buffered rows 2-4, no pointer update
+
+        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5           // both filtered rows packed into v1 (low half, high half)
+
+        st1             {v1.d}[0], [x0], x1              // first output row
+        st1             {v1.d}[1], [x0], x1              // second output row
+        subs            x4, x4, #2                       // loop ends only when the count reaches exactly zero, so the count must be even
+        bne             2b
+
+        add             sp,  sp,  #168+16                // release the temporary buffer
+        ret
+endfunc
diff --git a/libavcodec/aarch64/vp9dsp_init.h b/libavcodec/aarch64/vp9dsp_init.h
new file mode 100644
index 0000000000..9df1752c62
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
+#define AVCODEC_AARCH64_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp); /* defined by vp9dsp_init_10bpp_aarch64.c via the 16bpp template (BPP 10) */
+void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp); /* defined by vp9dsp_init_12bpp_aarch64.c via the 16bpp template (BPP 12) */
+
+#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
diff --git a/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
new file mode 100644
index 0000000000..0fa0d7f8c2
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10 /* bit depth pasted into the assembly function names by the template */
+#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64 /* name given to the public init function the template defines */
+#include "vp9dsp_init_16bpp_aarch64_template.c" /* instantiate the template with the two macros above */
diff --git a/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
new file mode 100644
index 0000000000..dae2232403
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12 /* bit depth pasted into the assembly function names by the template */
+#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64 /* name given to the public init function the template defines */
+#include "vp9dsp_init_16bpp_aarch64_template.c" /* instantiate the template with the two macros above */
diff --git a/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
new file mode 100644
index 0000000000..8dcfdeaaf7
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/aarch64/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix)                                          \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my) /* expands to the prototype of ff_vp9_<type><sz><suffix>_neon; the caller supplies the trailing semicolon */
+
+#define decl_mc_func(op, filter, dir, sz, bpp)                                                   \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my) /* expands to the prototype of ff_vp9_<op>_<filter><sz>_<dir>_<bpp>_neon; the caller supplies the trailing semicolon */
+
+#define define_8tap_2d_fn(op, filter, sz, bpp)                                      \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src,                 \
+                                                ptrdiff_t src_stride,               \
+                                                int h, int mx, int my)              \
+{                                                                                   \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]);         \
+    /* We only need h + 7 lines, but the horizontal filter assumes an               \
+     * even number of rows, so filter h + 8 lines here. */                          \
+    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz,                          \
+                                             src - 3 * src_stride, src_stride,      \
+                                             h + 8, mx, 0);                         \
+    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride,                    \
+                                                temp + 3 * 2 * sz, 2 * sz,          \
+                                                h, 0, my);                          \
+} /* Defines a static 2D (hv) function as two passes: a horizontal "put" into temp (row pitch 2 * sz bytes, starting 3 source rows above src), then the vertical op from row 3 of temp into dst. temp holds ((sz < 64 ? 2 * sz : sz) + 8) rows, presumably the largest h + 8 a caller can request - confirm. */
+
+#define decl_filter_funcs(op, dir, sz, bpp)  \
+    decl_mc_func(op, regular, dir, sz, bpp); \
+    decl_mc_func(op, sharp,   dir, sz, bpp); \
+    decl_mc_func(op, smooth,  dir, sz, bpp) /* prototypes for the regular, sharp and smooth variants of one op/direction/size */
+
+#define decl_mc_funcs(sz, bpp)           \
+    decl_filter_funcs(put, h,  sz, bpp); \
+    decl_filter_funcs(avg, h,  sz, bpp); \
+    decl_filter_funcs(put, v,  sz, bpp); \
+    decl_filter_funcs(avg, v,  sz, bpp); \
+    decl_filter_funcs(put, hv, sz, bpp); \
+    decl_filter_funcs(avg, hv, sz, bpp) /* prototypes for put/avg in h, v and hv; the hv functions installed by this file are the static ones from define_8tap_2d_fn, which carry no ff_vp9_ prefix */
+
+#define ff_vp9_copy32_neon  ff_vp9_copy32_aarch64 /* declare_fpel always pastes a _neon suffix; these aliases redirect the 32/64/128 copy names to the _aarch64 symbols that init_copy installs below */
+#define ff_vp9_copy64_neon  ff_vp9_copy64_aarch64
+#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
+
+declare_fpel(copy, 128, ); /* resolves to ff_vp9_copy128_aarch64 through the alias above */
+declare_fpel(copy, 64,  ); /* resolves to ff_vp9_copy64_aarch64 */
+declare_fpel(copy, 32,  ); /* resolves to ff_vp9_copy32_aarch64 */
+declare_fpel(copy, 16,  ); /* ff_vp9_copy16_neon */
+declare_fpel(copy, 8,   ); /* ff_vp9_copy8_neon */
+declare_fpel(avg, 64, _16); /* ff_vp9_avg64_16_neon, and likewise for the smaller sizes below */
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8,  _16);
+declare_fpel(avg, 4,  _16);
+
+decl_mc_funcs(64, BPP); /* BPP is expanded while passing through the nested macros, before it reaches the ## in decl_mc_func */
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp)        \
+    define_8tap_2d_fn(put, regular, sz, bpp) \
+    define_8tap_2d_fn(put, sharp,   sz, bpp) \
+    define_8tap_2d_fn(put, smooth,  sz, bpp) \
+    define_8tap_2d_fn(avg, regular, sz, bpp) \
+    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
+    define_8tap_2d_fn(avg, smooth,  sz, bpp) /* six static hv functions per size: put/avg x regular/sharp/smooth */
+
+define_8tap_2d_funcs(64, BPP) /* no trailing semicolon: each expansion is a sequence of complete function definitions */
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8,  BPP)
+define_8tap_2d_funcs(4,  BPP)
+
+static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp) /* Installs the motion-compensation function pointers in dsp->mc, gated on the CPU flags. */
+{
+    int cpu_flags = av_get_cpu_flags(); /* queried once; gates the ARMv8 and NEON assignments below */
+
+#define init_fpel(idx1, idx2, sz, type, suffix)      \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix /* the [0][0] slot of all four filter types shares one function */
+
+#define init_copy(idx, sz, suffix) \
+    init_fpel(idx, 0, sz, copy, suffix) /* copy functions go in slot idx2 = 0 */
+
+#define init_avg(idx, sz, suffix) \
+    init_fpel(idx, 1, sz, avg,  suffix) /* avg functions go in slot idx2 = 1 */
+
+#define init_copy_avg(idx, sz1, sz2) \
+    init_copy(idx, sz2, _neon);      \
+    init_avg (idx, sz1, _16_neon) /* sz1 names the avg function, sz2 the copy function; every use below passes sz2 = 2 * sz1 */
+
+    if (have_armv8(cpu_flags)) { /* the three largest copy sizes use the _aarch64 functions and need only ARMv8 */
+        init_copy(0, 128, _aarch64);
+        init_copy(1, 64,  _aarch64);
+        init_copy(2, 32,  _aarch64);
+    }
+
+    if (have_neon(cpu_flags)) { /* everything else is NEON */
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon /* pfx selects between the exported ff_vp9_ symbols and the unprefixed static hv wrappers */
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp)                                   \
+    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp) /* put and avg for the three 8-tap filters in one direction; FILTER_BILINEAR is not set here */
+
+#define init_mc_funcs_dirs(idx, sz, bpp)            \
+    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp) /* v fills [0][1], h fills [1][0], hv fills [1][1]; the empty pfx picks the static functions from define_8tap_2d_fn */
+
+
+        init_avg(0, 64, _16_neon); /* copy for idx 0-2 was installed in the ARMv8 branch above */
+        init_avg(1, 32, _16_neon);
+        init_avg(2, 16, _16_neon);
+        init_copy_avg(3, 8, 16); /* avg8_16 with copy16 */
+        init_copy_avg(4, 4, 8); /* avg4_16 with copy8 */
+
+        init_mc_funcs_dirs(0, 64, BPP);
+        init_mc_funcs_dirs(1, 32, BPP);
+        init_mc_funcs_dirs(2, 16, BPP);
+        init_mc_funcs_dirs(3, 8,  BPP);
+        init_mc_funcs_dirs(4, 4,  BPP);
+    }
+}
+
+#define define_itxfm2(type_a, type_b, sz, bpp)                                     \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst,    \
+                                                                 ptrdiff_t stride, \
+                                                                 int16_t *_block, int eob) /* expands to the prototype of ff_vp9_<a>_<b>_<sz>x<sz>_add_<bpp>_neon */
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp) /* extra macro level so that an argument such as BPP is expanded before it is pasted with ## */
+
+#define define_itxfm_funcs(sz, bpp)      \
+    define_itxfm(idct,  idct,  sz, bpp); \
+    define_itxfm(iadst, idct,  sz, bpp); \
+    define_itxfm(idct,  iadst, sz, bpp); \
+    define_itxfm(iadst, iadst, sz, bpp) /* the four idct/iadst combinations for one size */
+
+define_itxfm_funcs(4,  BPP);
+define_itxfm_funcs(8,  BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP); /* only the idct_idct combination is declared for 32x32 */
+define_itxfm(iwht, iwht, 4,  BPP); /* ff_vp9_iwht_iwht_4x4_add_<bpp>_neon */
+
+
+static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp) /* Installs the NEON inverse-transform-and-add functions in dsp->itxfm_add when NEON is available. */
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp)                                               \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;  \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon /* note the mapping: DCT_ADST takes iadst_idct and ADST_DCT takes idct_iadst */
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp) /* extra macro level so BPP is expanded before pasting */
+
+#define init_idct2(tx, nm, bpp)     \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon /* one function shared by all four transform-type slots */
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp) /* extra macro level so BPP is expanded before pasting */
+
+        init_itxfm(TX_4X4,   4x4,   BPP);
+        init_itxfm(TX_8X8,   8x8,   BPP);
+        init_itxfm(TX_16X16, 16x16, BPP);
+        init_idct(TX_32X32, idct_idct_32x32, BPP); /* 32x32 has only the idct_idct function */
+        init_idct(4,        iwht_iwht_4x4,   BPP); /* literal index 4 receives the iwht function; presumably the lossless slot after TX_32X32 - confirm against vp9dsp.h */
+    }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H) /* expands to the prototype of ff_vp9_loop_filter_<dir>_<wd>_<size>_<bpp>_neon */
+
+#define define_loop_filters(wd, size, bpp) \
+    define_loop_filter(h, wd, size, bpp);  \
+    define_loop_filter(v, wd, size, bpp) /* declares both the h and the v variant */
+
+define_loop_filters(4,  8,  BPP); /* the three size-8 pairs are installed in dsp->loop_filter_8 */
+define_loop_filters(8,  8,  BPP);
+define_loop_filters(16, 8,  BPP);
+
+define_loop_filters(16, 16, BPP); /* installed in dsp->loop_filter_16 */
+
+define_loop_filters(44, 16, BPP); /* 44/48/84/88 are installed in dsp->loop_filter_mix2 at [0][0], [0][1], [1][0] and [1][1] */
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
+
+static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp) /* Installs the NEON loop filter functions in dsp->loop_filter_8, loop_filter_16 and loop_filter_mix2 when NEON is available. */
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon /* idx1 selects the width, idx2 the direction */
+
+#define init_lpf_func_16(idx, dir, bpp) \
+    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon /* idx selects the direction */
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon /* idx1/idx2 select the width pair, idx3 the direction */
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+    init_lpf_func_8(idx, 0, h, wd, bpp);  \
+    init_lpf_func_8(idx, 1, v, wd, bpp) /* direction slot 0 = h, 1 = v */
+
+#define init_lpf_funcs_16(bpp)   \
+    init_lpf_func_16(0, h, bpp); \
+    init_lpf_func_16(1, v, bpp) /* direction slot 0 = h, 1 = v */
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);  \
+    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp) /* direction slot 0 = h, 1 = v */
+
+#define init_lpf_funcs_8(bpp)        \
+    init_lpf_funcs_8_wd(0, 4,  bpp); \
+    init_lpf_funcs_8_wd(1, 8,  bpp); \
+    init_lpf_funcs_8_wd(2, 16, bpp) /* width slots 0, 1, 2 = wd 4, 8, 16 */
+
+#define init_lpf_funcs_mix2(bpp)           \
+    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+    init_lpf_funcs_mix2_wd(1, 1, 88, bpp) /* the two digits of wd track the two indices: digit 4 pairs with index 0, digit 8 with index 1 */
+
+        init_lpf_funcs_8(BPP);
+        init_lpf_funcs_16(BPP);
+        init_lpf_funcs_mix2(BPP);
+    }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp) /* Public entry point; INIT_FUNC is defined by the file that includes this template. */
+{
+    vp9dsp_mc_init_aarch64(dsp); /* motion compensation pointers */
+    vp9dsp_loopfilter_init_aarch64(dsp); /* loop filter pointers */
+    vp9dsp_itxfm_init_aarch64(dsp); /* inverse transform pointers */
+}
diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c
index 3ce2c1b2b9..4c699759fe 100644
--- a/libavcodec/aarch64/vp9dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c
@@ -1,28 +1,30 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdint.h>
 
 #include "libavutil/attributes.h"
+#include "libavutil/internal.h"
 #include "libavutil/aarch64/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
 
 #define declare_fpel(type, sz)                                          \
 void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
@@ -239,8 +241,17 @@ static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
     }
 }
 
-av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp)
+av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
 {
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_aarch64(dsp);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_aarch64(dsp);
+        return;
+    } else if (bpp != 8)
+        return;
+
     vp9dsp_mc_init_aarch64(dsp);
     vp9dsp_loopfilter_init_aarch64(dsp);
     vp9dsp_itxfm_init_aarch64(dsp);
diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
new file mode 100644
index 0000000000..68296d9c40
--- /dev/null
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,2017 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+        .short  11585, 0, 6270, 15137 // idct4 constants; the 4x4 functions widen these into v0.4s
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377 // iadst4 constants; the 4x4 functions widen these into v1.4s
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 // widened into v2/v3 by the non idct_idct 8x8 functions
+idct_coeffs:
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102 // first row: widened into v0/v1 by the 8x8 functions
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053 // NOTE(review): no user visible nearby; presumably read by the 16x16 iadst - confirm
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
+        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s // step 1: interleave the 32 bit lanes of each row pair into scratch r4-r7
+        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
+        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
+        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
+        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d // step 2: combine 64 bit halves; the transposed rows land back in r0-r3
+        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
+        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
+        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
+.endm
+
+// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers.
+.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
+        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
+
+        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+        // while swapping the two 4x4 matrices between each other
+
+        // First step of the 4x4 transpose of r1-r7, into t0-t3
+        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
+        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
+        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
+
+        // First step of the 4x4 transpose of r8-r14, into r1-r7
+        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
+        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
+        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
+        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
+
+        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
+        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
+        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
+        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
+        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
+
+        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
+        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
+        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
+        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
+
+        // Move the outputs of trn1 back in place
+        mov             \r1\().16b,  \t0\().16b
+        mov             \r3\().16b,  \t1\().16b
+.endm
+
+// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
+// in/out are .4s registers; this can do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+        neg             \tmp4\().4s, v0.4s // neg > 0: negated coefficient, applied to the out1 product only
+.endif
+        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
+        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
+.if \neg > 0
+        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0] // last use of the negated coefficient; tmp4 is overwritten here
+.else
+        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
+.endif
+.ifb \tmp5
+        rshrn           \out1\().2s, \tmp3\().2d, #14 // 4 temp form: narrow out1 first so tmp3/tmp4 can be reused for out2
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out2\().2s, \tmp3\().2d, #14
+        rshrn2          \out2\().4s, \tmp4\().2d, #14
+.else
+        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0] // 6 temp form: both products are formed before either is narrowed
+        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out1\().2s, \tmp3\().2d, #14
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        rshrn           \out2\().2s, \tmp5\().2d, #14
+        rshrn2          \out2\().4s, \tmp6\().2d, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().2d, \in1\().2s,  v0.s[0] // only tmp1/tmp2 are used; in2 and tmp3-tmp6 are never touched
+        smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
+        rshrn           \out1\().2s, \tmp1\().2d, #14
+        rshrn2          \out1\().4s, \tmp2\().2d, #14
+        rshrn           \out2\().2s, \tmp1\().2d, #14
+        rshrn2          \out2\().4s, \tmp2\().2d, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .2d registers, in are 2 x .4s registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+        smull           \out1\().2d, \in1\().2s, \coef1 // coef1/coef2 are by-element operands, e.g. v0.s[2]
+        smull2          \out2\().2d, \in1\().4s, \coef1
+        smull           \out3\().2d, \in1\().2s, \coef2
+        smull2          \out4\().2d, \in1\().4s, \coef2
+        smlsl           \out1\().2d, \in2\().2s, \coef2 // results stay as unrounded 64 bit sums; the caller narrows them
+        smlsl2          \out2\().2d, \in2\().4s, \coef2
+        smlal           \out3\().2d, \in2\().2s, \coef1
+        smlal2          \out4\().2d, \in2\().4s, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .4s registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+        neg             \tmp3\().2d, \tmp3\().2d // neg > 0: negate the inout2 sum before it is rounded and narrowed
+        neg             \tmp4\().2d, \tmp4\().2d
+.endif
+        rshrn           \inout1\().2s, \tmp1\().2d,  #14
+        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
+        rshrn           \inout2\().2s, \tmp3\().2d,  #14
+        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().2d, \inout1\().2s, \coef1 // inout2 is never read here, only written
+        smull2          \tmp2\().2d, \inout1\().4s, \coef1
+        smull           \tmp3\().2d, \inout1\().2s, \coef2
+        smull2          \tmp4\().2d, \inout1\().4s, \coef2
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().2d, \inout2\().2s, \coef2
+        smull2          \tmp2\().2d, \inout2\().4s, \coef2
+        smull           \tmp3\().2d, \inout2\().2s, \coef1
+        smull2          \tmp4\().2d, \inout2\().4s, \coef1
+        neg             \tmp1\().2d, \tmp1\().2d // inout1 = -inout2 * coef2, so negate before narrowing
+        neg             \tmp2\().2d, \tmp2\().2d
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().2d, \in\().2s, \coef // out1 = 64 bit products of the low two lanes of in with coef
+        smull2          \out2\().2d, \in\().4s, \coef // out2 = 64 bit products of the high two lanes of in with coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().2s, \in1\().2d, \shift // rounding narrowing shift of in1 into the low two lanes of out
+        rshrn2          \out\().4s, \in2\().2d, \shift // rounding narrowing shift of in2 into the high two lanes of out
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_4s out1, out2, in1, in2
+        add             \out1\().4s, \in1\().4s, \in2\().4s // lane-wise on four 32 bit elements; out1 must not alias in1/in2
+        sub             \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_4s_r out1, out2, in1, in2
+        sub             \out1\().4s, \in1\().4s, \in2\().4s // lane-wise on four 32 bit elements; out1 must not alias in1/in2
+        add             \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .4s registers, in are 4 x .2d registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        add             \tmp1\().2d, \in1\().2d, \in3\().2d // both adds come first, so tmp3/tmp4 may alias inputs (iadst8 relies on this)
+        add             \tmp2\().2d, \in2\().2d, \in4\().2d
+        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
+        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
+        rshrn           \out1\().2s, \tmp1\().2d,  #14
+        rshrn2          \out1\().4s, \tmp2\().2d,  #14
+        rshrn           \out2\().2s, \tmp3\().2d,  #14
+        rshrn2          \out2\().4s, \tmp4\().2d,  #14
+.endm
+
+.macro iwht4_10 c0, c1, c2, c3
+        add             \c0\().4s, \c0\().4s, \c1\().4s // in place on c0-c3 (.4s); uses only add/sub and one shift, no coefficients
+        sub             v17.4s,    \c2\().4s, \c3\().4s // v16 and v17 are clobbered as temporaries
+        sub             v16.4s,    \c0\().4s, v17.4s
+        sshr            v16.4s,    v16.4s,    #1
+        sub             \c2\().4s, v16.4s,    \c1\().4s
+        sub             \c1\().4s, v16.4s,    \c3\().4s
+        add             \c3\().4s, v17.4s,    \c2\().4s
+        sub             \c0\().4s, \c0\().4s, \c1\().4s
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3
+        iwht4_10        \c0, \c1, \c2, \c3 // the 12 bpp variant simply reuses the 10 bpp code
+.endm
+
+.macro idct4_10 c0, c1, c2, c3
+        mul             v22.4s,    \c1\().4s, v0.s[3] // 32 bit products here; idct4_12 does the same math with 64 bit products
+        mul             v20.4s,    \c1\().4s, v0.s[2] // coefficients come from v0.s[0], v0.s[2] and v0.s[3]
+        add             v16.4s,    \c0\().4s, \c2\().4s // clobbers v16, v17, v18, v20, v22 and v24
+        sub             v17.4s,    \c0\().4s, \c2\().4s
+        mla             v22.4s,    \c3\().4s, v0.s[2]
+        mul             v18.4s,    v16.4s,    v0.s[0]
+        mul             v24.4s,    v17.4s,    v0.s[0]
+        mls             v20.4s,    \c3\().4s, v0.s[3]
+        srshr           v22.4s,    v22.4s,    #14
+        srshr           v18.4s,    v18.4s,    #14
+        srshr           v24.4s,    v24.4s,    #14
+        srshr           v20.4s,    v20.4s,    #14
+        add             \c0\().4s, v18.4s,    v22.4s
+        sub             \c3\().4s, v18.4s,    v22.4s
+        add             \c1\().4s, v24.4s,    v20.4s
+        sub             \c2\().4s, v24.4s,    v20.4s
+.endm
+
+.macro idct4_12 c0, c1, c2, c3
+        smull           v22.2d,    \c1\().2s, v0.s[3] // same math as idct4_10, but with 64 bit intermediate products
+        smull2          v23.2d,    \c1\().4s, v0.s[3] // clobbers v16-v25
+        smull           v20.2d,    \c1\().2s, v0.s[2]
+        smull2          v21.2d,    \c1\().4s, v0.s[2]
+        add             v16.4s,    \c0\().4s, \c2\().4s
+        sub             v17.4s,    \c0\().4s, \c2\().4s
+        smlal           v22.2d,    \c3\().2s, v0.s[2]
+        smlal2          v23.2d,    \c3\().4s, v0.s[2]
+        smull           v18.2d,    v16.2s,    v0.s[0]
+        smull2          v19.2d,    v16.4s,    v0.s[0]
+        smull           v24.2d,    v17.2s,    v0.s[0]
+        smull2          v25.2d,    v17.4s,    v0.s[0]
+        smlsl           v20.2d,    \c3\().2s, v0.s[3]
+        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
+        rshrn           v22.2s,    v22.2d,    #14 // round and narrow each 64 bit pair back into one .4s register
+        rshrn2          v22.4s,    v23.2d,    #14
+        rshrn           v18.2s,    v18.2d,    #14
+        rshrn2          v18.4s,    v19.2d,    #14
+        rshrn           v24.2s,    v24.2d,    #14
+        rshrn2          v24.4s,    v25.2d,    #14
+        rshrn           v20.2s,    v20.2d,    #14
+        rshrn2          v20.4s,    v21.2d,    #14
+        add             \c0\().4s, v18.4s,    v22.4s
+        sub             \c3\().4s, v18.4s,    v22.4s
+        add             \c1\().4s, v24.4s,    v20.4s
+        sub             \c2\().4s, v24.4s,    v20.4s
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3
+        mul             v16.4s,    \c0\().4s, v1.s[0] // 32 bit products; coefficients come from v1.s[0] to v1.s[3]
+        mla             v16.4s,    \c2\().4s, v1.s[1] // clobbers v16, v18, v20, v22, v24 and v26
+        mla             v16.4s,    \c3\().4s, v1.s[2]
+        mul             v18.4s,    \c0\().4s, v1.s[2]
+        mls             v18.4s,    \c2\().4s, v1.s[0]
+        sub             \c0\().4s, \c0\().4s, \c2\().4s // c0 is reused as a temp from here: c0 - c2 + c3
+        mls             v18.4s,    \c3\().4s, v1.s[1]
+        add             \c0\().4s, \c0\().4s, \c3\().4s
+        mul             v22.4s,    \c1\().4s, v1.s[3]
+        mul             v20.4s,    \c0\().4s, v1.s[3]
+        add             v24.4s,    v16.4s,    v22.4s
+        add             v26.4s,    v18.4s,    v22.4s
+        srshr           \c0\().4s, v24.4s,    #14
+        add             v16.4s,    v16.4s,    v18.4s
+        srshr           \c1\().4s, v26.4s,    #14
+        sub             v16.4s,    v16.4s,    v22.4s
+        srshr           \c2\().4s, v20.4s,    #14
+        srshr           \c3\().4s, v16.4s,    #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3
+        smull           v16.2d,    \c0\().2s, v1.s[0] // same math as iadst4_10, but with 64 bit intermediate products
+        smull2          v17.2d,    \c0\().4s, v1.s[0] // clobbers v16-v27
+        smlal           v16.2d,    \c2\().2s, v1.s[1]
+        smlal2          v17.2d,    \c2\().4s, v1.s[1]
+        smlal           v16.2d,    \c3\().2s, v1.s[2]
+        smlal2          v17.2d,    \c3\().4s, v1.s[2]
+        smull           v18.2d,    \c0\().2s, v1.s[2]
+        smull2          v19.2d,    \c0\().4s, v1.s[2]
+        smlsl           v18.2d,    \c2\().2s, v1.s[0]
+        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
+        sub             \c0\().4s, \c0\().4s, \c2\().4s // c0 is reused as a temp from here: c0 - c2 + c3
+        smlsl           v18.2d,    \c3\().2s, v1.s[1]
+        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
+        add             \c0\().4s, \c0\().4s, \c3\().4s
+        smull           v22.2d,    \c1\().2s, v1.s[3]
+        smull2          v23.2d,    \c1\().4s, v1.s[3]
+        smull           v20.2d,    \c0\().2s, v1.s[3]
+        smull2          v21.2d,    \c0\().4s, v1.s[3]
+        add             v24.2d,    v16.2d,    v22.2d
+        add             v25.2d,    v17.2d,    v23.2d
+        add             v26.2d,    v18.2d,    v22.2d
+        add             v27.2d,    v19.2d,    v23.2d
+        rshrn           \c0\().2s, v24.2d,    #14
+        rshrn2          \c0\().4s, v25.2d,    #14
+        add             v16.2d,    v16.2d,    v18.2d
+        add             v17.2d,    v17.2d,    v19.2d
+        rshrn           \c1\().2s, v26.2d,    #14
+        rshrn2          \c1\().4s, v27.2d,    #14
+        sub             v16.2d,    v16.2d,    v22.2d
+        sub             v17.2d,    v17.2d,    v23.2d
+        rshrn           \c2\().2s, v20.2d,    #14
+        rshrn2          \c2\().4s, v21.2d,    #14
+        rshrn           \c3\().2s, v16.2d,    #14
+        rshrn2          \c3\().4s, v17.2d,    #14
+.endm
+
+// The public functions in this file have got the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+.endif
+.ifc \txfm1,iadst
+        movrel          x4,  iadst4_coeffs
+        ld1             {v0.d}[1], [x4] // load into the high half so that sxtl2 can widen it into v1
+        sxtl2           v1.4s,  v0.8h
+.endif
+.else
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.8h}, [x4] // mixed transforms: load both tables, idct into v0 and iadst into v1
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+
+        movi            v30.4s, #0 // v30/v31 = zero, used to clear the coefficient block
+        movi            v31.4s, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1 // w3 = eob
+        b.ne            1f
+        // DC-only for idct/idct
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s, v0.s[0]
+        rshrn           v2.2s,  v2.2d, #14
+        smull           v2.2d,  v2.2s, v0.s[0]
+        rshrn           v2.2s,  v2.2d, #14
+        st1             {v31.s}[0], [x2] // clear the DC coefficient in the block
+        dup             v4.4s,  v2.s[0]
+        mov             v5.16b, v4.16b
+        mov             v6.16b, v4.16b
+        mov             v7.16b, v4.16b
+        b               2f
+.endif
+
+1:
+        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2] // the 16 coefficients are read as 32 bit values
+        st1             {v30.4s,v31.4s}, [x2], #32 // zero the first 32 bytes of the block
+
+.ifc \txfm1,iwht
+        sshr            v4.4s,  v4.4s,  #2 // iwht only: scale the input down by 2 bits before the transform
+        sshr            v5.4s,  v5.4s,  #2
+        sshr            v6.4s,  v6.4s,  #2
+        sshr            v7.4s,  v7.4s,  #2
+.endif
+
+        \txfm1\()4_\bpp v4,  v5,  v6,  v7
+
+        st1             {v30.4s,v31.4s}, [x2], #32 // zero the remaining 32 bytes of the block
+        // Transpose 4x4 with 32 bit elements
+        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
+
+        \txfm2\()4_\bpp v4,  v5,  v6,  v7
+2:
+        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 // v31 = (1 << bpp) - 1 in every lane, the largest pixel value
+        ld1             {v0.4h},   [x0], x1
+        ld1             {v1.4h},   [x0], x1
+.ifnc \txfm1,iwht
+        srshr           v4.4s,  v4.4s,  #4 // final rounding shift; skipped for iwht
+        srshr           v5.4s,  v5.4s,  #4
+        srshr           v6.4s,  v6.4s,  #4
+        srshr           v7.4s,  v7.4s,  #4
+.endif
+        uaddw           v4.4s,  v4.4s,  v0.4h // add the residual to the widened destination pixels
+        uaddw           v5.4s,  v5.4s,  v1.4h
+        ld1             {v2.4h},   [x0], x1
+        ld1             {v3.4h},   [x0], x1
+        sqxtun          v0.4h,  v4.4s // narrow to 16 bit, saturating negative values to zero
+        sqxtun2         v0.8h,  v5.4s
+        sub             x0,  x0,  x1, lsl #2 // rewind dst by the four rows just loaded
+
+        uaddw           v6.4s,  v6.4s,  v2.4h
+        umin            v0.8h,  v0.8h,  v31.8h // clamp to the largest pixel value
+        uaddw           v7.4s,  v7.4s,  v3.4h
+        st1             {v0.4h},   [x0], x1
+        sqxtun          v2.4h,  v6.4s
+        sqxtun2         v2.8h,  v7.4s
+        umin            v2.8h,  v2.8h,  v31.8h
+
+        st1             {v0.d}[1], [x0], x1
+        st1             {v2.4h},   [x0], x1
+        st1             {v2.d}[1], [x0], x1
+
+        ret
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct,  idct,  \bpp
+itxfm_func4x4 iadst, idct,  \bpp
+itxfm_func4x4 idct,  iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht,  iwht,  \bpp
+.endm
+// Emit the five 4x4 transform combinations above for 10 and for 12 bits per pixel
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
+function idct8x8_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+
+        movi            v1.4h,  #0
+        sxtl            v0.4s,  v0.4h
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0] // scale the DC coefficient by v0.s[0] with rounding >> 14, twice
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2] // clear the DC coefficient in the block
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v2.4s,  v2.4s,  #5 // same final rounding shift as the full 8x8 path
+
+        mov             x4,  #8 // row counter
+        mov             x3,  x0 // x0 is the load pointer, x3 the store pointer
+        dup             v31.8h, w5 // w5 = largest pixel value, set by the 10/12 bpp entry points
+1:
+        // Loop to add the constant from v2 into all 8x8 outputs
+        subs            x4,  x4,  #2 // two rows per iteration
+        ld1             {v3.8h},  [x0], x1
+        ld1             {v4.8h},  [x0], x1
+        uaddw           v16.4s, v2.4s,  v3.4h
+        uaddw2          v17.4s, v2.4s,  v3.8h
+        uaddw           v18.4s, v2.4s,  v4.4h
+        uaddw2          v19.4s, v2.4s,  v4.8h
+        sqxtun          v3.4h,  v16.4s // narrow, saturating negative values to zero
+        sqxtun2         v3.8h,  v17.4s
+        sqxtun          v4.4h,  v18.4s
+        sqxtun2         v4.8h,  v19.4s
+        umin            v3.8h,  v3.8h,  v31.8h // clamp to the largest pixel value
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h},  [x3], x1
+        st1             {v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
+        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
+        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
+        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
+
+        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
+        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
+        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
+        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
+
+        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5; r0-r3 are dead here and serve as temps
+
+        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
+        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
+        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
+        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
+.endm
+
+.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
+        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
+
+        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
+        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
+
+        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
+        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
+
+        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
+        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
+
+        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
+        neg             \r7\().4s, \r7\().4s // r7 = out[7]
+        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
+
+        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
+        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
+
+        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
+
+        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4] (4 temp form of dmbutterfly0)
+        neg             \r3\().4s, \r3\().4s  // r3 = out[3]
+
+        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
+        neg             \r1\().4s, \r1\().4s  // r1 = out[1]
+
+        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5] (4 temp form of dmbutterfly0)
+        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct8x8_dc_add_neon // tail call for the DC-only case; x5 is passed through untouched
+.endif
+        // The iadst also uses a few coefficients from
+        // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+        movrel          x4,  idct_coeffs
+.else
+        movrel          x4,  iadst8_coeffs
+        ld1             {v1.8h}, [x4], #16 // the post-increment leaves x4 at idct_coeffs, which follows directly
+        stp             d8,  d9,  [sp, #-0x10]! // v8/v9 serve as temps below; d8/d9 are callee saved, so preserve them
+        sxtl2           v3.4s,  v1.8h
+        sxtl            v2.4s,  v1.4h
+.endif
+        ld1             {v0.8h}, [x4]
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+
+        movi            v4.4s, #0 // v4-v7 = zero, used to clear the coefficient block
+        movi            v5.4s, #0
+        movi            v6.4s, #0
+        movi            v7.4s, #0
+
+1:
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
+        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
+        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
+        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
+        sub             x2,  x2,  #256 // rewind to the start of the block to zero it
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
+        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
+.else
+        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
+        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
+.endif
+
+        // Transpose 8x8 with 32 bit elements
+        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
+        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
+.else
+        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
+        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
+.endif
+2:
+        mov             x3,  x0 // x0 is the load pointer, x3 the store pointer
+        // Add into the destination
+        ld1             {v0.8h},  [x0], x1
+        srshr           v16.4s, v16.4s, #5
+        srshr           v17.4s, v17.4s, #5
+        ld1             {v1.8h},  [x0], x1
+        srshr           v18.4s, v18.4s, #5
+        srshr           v19.4s, v19.4s, #5
+        ld1             {v2.8h},  [x0], x1
+        srshr           v20.4s, v20.4s, #5
+        srshr           v21.4s, v21.4s, #5
+        uaddw           v16.4s, v16.4s, v0.4h
+        uaddw2          v17.4s, v17.4s, v0.8h
+        ld1             {v3.8h},  [x0], x1
+        srshr           v22.4s, v22.4s, #5
+        srshr           v23.4s, v23.4s, #5
+        uaddw           v18.4s, v18.4s, v1.4h
+        uaddw2          v19.4s, v19.4s, v1.8h
+        ld1             {v4.8h},  [x0], x1
+        srshr           v24.4s, v24.4s, #5
+        srshr           v25.4s, v25.4s, #5
+        uaddw           v20.4s, v20.4s, v2.4h
+        uaddw2          v21.4s, v21.4s, v2.8h
+        sqxtun          v0.4h,  v16.4s
+        sqxtun2         v0.8h,  v17.4s
+        dup             v16.8h, w5 // v16 was just narrowed into v0, so reuse it for the largest pixel value
+        ld1             {v5.8h},  [x0], x1
+        srshr           v26.4s, v26.4s, #5
+        srshr           v27.4s, v27.4s, #5
+        uaddw           v22.4s, v22.4s, v3.4h
+        uaddw2          v23.4s, v23.4s, v3.8h
+        sqxtun          v1.4h,  v18.4s
+        sqxtun2         v1.8h,  v19.4s
+        umin            v0.8h,  v0.8h,  v16.8h
+        ld1             {v6.8h},  [x0], x1
+        srshr           v28.4s, v28.4s, #5
+        srshr           v29.4s, v29.4s, #5
+        uaddw           v24.4s, v24.4s, v4.4h
+        uaddw2          v25.4s, v25.4s, v4.8h
+        sqxtun          v2.4h,  v20.4s
+        sqxtun2         v2.8h,  v21.4s
+        umin            v1.8h,  v1.8h,  v16.8h
+        ld1             {v7.8h},  [x0], x1
+        srshr           v30.4s, v30.4s, #5
+        srshr           v31.4s, v31.4s, #5
+        uaddw           v26.4s, v26.4s, v5.4h
+        uaddw2          v27.4s, v27.4s, v5.8h
+        sqxtun          v3.4h,  v22.4s
+        sqxtun2         v3.8h,  v23.4s
+        umin            v2.8h,  v2.8h,  v16.8h
+
+        st1             {v0.8h},  [x3], x1
+        uaddw           v28.4s, v28.4s, v6.4h
+        uaddw2          v29.4s, v29.4s, v6.8h
+        st1             {v1.8h},  [x3], x1
+        sqxtun          v4.4h,  v24.4s
+        sqxtun2         v4.8h,  v25.4s
+        umin            v3.8h,  v3.8h,  v16.8h
+        st1             {v2.8h},  [x3], x1
+        uaddw           v30.4s, v30.4s, v7.4h
+        uaddw2          v31.4s, v31.4s, v7.8h
+        st1             {v3.8h},  [x3], x1
+        sqxtun          v5.4h,  v26.4s
+        sqxtun2         v5.8h,  v27.4s
+        umin            v4.8h,  v4.8h,  v16.8h
+        st1             {v4.8h},  [x3], x1
+        sqxtun          v6.4h,  v28.4s
+        sqxtun2         v6.8h,  v29.4s
+        umin            v5.8h,  v5.8h,  v16.8h
+        st1             {v5.8h},  [x3], x1
+        sqxtun          v7.4h,  v30.4s
+        sqxtun2         v7.8h,  v31.4s
+        umin            v6.8h,  v6.8h,  v16.8h
+
+        st1             {v6.8h},  [x3], x1
+        umin            v7.8h,  v7.8h,  v16.8h
+        st1             {v7.8h},  [x3], x1
+
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d8,  d9,  [sp], 0x10 // restore the registers saved in the prologue
+.endif
+        ret
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+        mov             x5,  #0x03ff // largest 10 bit pixel value, consumed as w5 by the shared body
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+        mov             x5,  #0x0fff // largest 12 bit pixel value, consumed as w5 by the shared body
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+
+        movi            v1.4h,  #0
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0] // scale the DC coefficient by v0.s[0] with rounding >> 14, twice
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2] // clear the DC coefficient in the block
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v0.4s,  v2.4s,  #6 // v0 = rounded DC value; v2 is reused for pixel data in the loop
+
+        mov             x3, x0 // x0 is the load pointer, x3 the store pointer
+        mov             x4, #16 // row counter
+        dup             v31.8h, w13 // NOTE(review): w13 is not set here; presumably the caller passes the largest pixel value - confirm
+1:
+        // Loop to add the constant from v0 into all 16x16 outputs
+        subs            x4,  x4,  #2 // two rows per iteration
+        ld1             {v1.8h,v2.8h},  [x0], x1
+        uaddw           v16.4s, v0.4s,  v1.4h
+        uaddw2          v17.4s, v0.4s,  v1.8h
+        ld1             {v3.8h,v4.8h},  [x0], x1
+        uaddw           v18.4s, v0.4s,  v2.4h
+        uaddw2          v19.4s, v0.4s,  v2.8h
+        uaddw           v20.4s, v0.4s,  v3.4h
+        uaddw2          v21.4s, v0.4s,  v3.8h
+        uaddw           v22.4s, v0.4s,  v4.4h
+        uaddw2          v23.4s, v0.4s,  v4.8h
+        sqxtun          v1.4h,  v16.4s // narrow, saturating negative values to zero
+        sqxtun2         v1.8h,  v17.4s
+        sqxtun          v2.4h,  v18.4s
+        sqxtun2         v2.8h,  v19.4s
+        sqxtun          v3.4h,  v20.4s
+        sqxtun2         v3.8h,  v21.4s
+        sqxtun          v4.4h,  v22.4s
+        sqxtun2         v4.8h,  v23.4s
+        umin            v1.8h,  v1.8h,  v31.8h // clamp to the value broadcast into v31
+        umin            v2.8h,  v2.8h,  v31.8h
+        st1             {v1.8h,v2.8h},  [x3], x1
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h,v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct16_end                                        // shared final stages of idct16, idct16_half and idct16_quarter; leaves out[0..15] in v16-v31
+        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
+        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
+        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
+        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
+        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
+        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
+        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
+        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
+
+        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
+        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
+
+        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
+        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
+        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
+        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
+        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
+        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
+        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
+        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
+        ret                                              // returns from whichever function expanded this macro
+.endm
+
+function idct16                                          // 16-point IDCT on v16-v31 in place; reads coefficients from v0-v3, uses v4-v9 as scratch
+        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
+        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
+        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
+        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
+        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
+        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
+        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_half                                     // idct16 variant called when only v16-v23 were loaded; NOTE(review): the _h/_h1/_h2 macros presumably skip the unloaded inputs - confirm
+        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
+        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
+        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
+        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
+        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
+        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
+        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter                                  // idct16 variant called when only v16-v19 were loaded; results end up in v16-v31 via idct16_end
+        dsmull_h        v24, v25, v19, v3.s[3]
+        dsmull_h        v4,  v5,  v17, v2.s[0]
+        dsmull_h        v7,  v6,  v18, v1.s[1]
+        dsmull_h        v30, v31, v18, v1.s[0]
+        neg             v24.2d,  v24.2d                  // negate the 64-bit product pair v24/v25 before narrowing
+        neg             v25.2d,  v25.2d
+        dsmull_h        v29, v28, v17, v2.s[1]
+        dsmull_h        v26, v27, v19, v3.s[2]
+        dsmull_h        v22, v23, v16, v0.s[0]
+        drshrn_h        v24, v24, v25, #14               // narrow each product pair with a shift of 14
+        drshrn_h        v16, v4,  v5,  #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v6,  v30, v31, #14
+        drshrn_h        v29, v29, v28, #14
+        drshrn_h        v17, v26, v27, #14
+        drshrn_h        v28, v22, v23, #14
+
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
+        neg             v22.2d,  v22.2d
+        neg             v23.2d,  v23.2d
+        drshrn_h        v27, v20, v21, #14
+        drshrn_h        v21, v22, v23, #14
+        drshrn_h        v23, v18, v19, #14
+        drshrn_h        v25, v30, v31, #14
+        mov             v4.16b,  v28.16b                 // v4, v5 and v20 all receive the value of v28;
+        mov             v5.16b,  v28.16b                 // NOTE(review): presumably t0..t3 coincide when only in[0] feeds the even part - confirm
+        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
+        mov             v20.16b, v28.16b
+        idct16_end
+endfunc
+
+function iadst16                                         // 16-point IADST on v16-v31 in place; x11 = iadst16 coefficients, x10 = idct coefficients; overwrites v0-v15
+        ld1             {v0.8h,v1.8h}, [x11]             // 16 coefficients of 16 bits each from the table at x11
+        sxtl            v2.4s,  v1.4h                    // widen them to 32 bits, spread over v0-v3
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
+        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
+        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
+        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
+        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
+        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
+        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
+        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
+
+        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
+        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
+        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
+        ld1             {v0.8h}, [x10]                   // the iadst table is done with; switch to the table at x10
+        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
+        sxtl2           v1.4s,  v0.8h                    // widen the first 8 coefficients into v0/v1
+        sxtl            v0.4s,  v0.4h
+        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
+        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
+
+        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
+        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
+        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
+        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
+        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
+
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
+        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
+        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
+        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
+
+        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
+        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
+
+        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
+        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
+        neg             v29.4s, v29.4s                   // v29 = out[13]
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
+        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
+
+        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
+        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
+
+        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
+        neg             v19.4s, v19.4s                   // v19 = out[3]
+        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
+
+        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
+        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
+
+        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
+        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
+
+        neg             v31.4s,  v5.4s                    // v31 = out[15]
+        neg             v17.4s,  v3.4s                    // v17 = out[1]
+
+        mov             v16.16b, v2.16b                  // move out[0] into its final register
+        mov             v30.16b, v4.16b                  // move out[14] into its final register
+        ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below. Each takes a register number i.
+.macro load i, src, inc                                  // load v<i>.4s from [src], then advance src by inc
+        ld1             {v\i\().4s},  [\src], \inc
+.endm
+.macro store i, dst, inc                                 // store v<i>.4s to [dst], then advance dst by inc
+        st1             {v\i\().4s},  [\dst], \inc
+.endm
+.macro movi_v i, size, imm                               // movi of imm into v<i> with the given arrangement suffix
+        movi            v\i\()\size,  \imm
+.endm
+.macro load_clear i, src, inc                            // load v<i>.4s from [src] and overwrite the source with v4
+        ld1             {v\i\().4s}, [\src]
+        st1             {v4.4s},  [\src], \inc           // the callers zero v4 beforehand, so this clears the coefficients just read
+.endm
+
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7 // round 8 rows of 4 coefficients, add to the pixels at x0/x3, clamp against v8, store
+        srshr           \coef0, \coef0, #6               // rounding shift right by 6 of the transform output
+        ld1             {v4.4h},   [x0], x1              // row pairs share a register: low half from x0 ...
+        srshr           \coef1, \coef1, #6
+        ld1             {v4.d}[1], [x3], x1              // ... high half from x3 (callers set x3 = x0 + stride, x1 = 2 * stride)
+        srshr           \coef2, \coef2, #6
+        ld1             {v5.4h},   [x0], x1
+        srshr           \coef3, \coef3, #6
+        uaddw           \coef0, \coef0, v4.4h            // widen the unsigned pixels and add them to the 32-bit coefficients
+        ld1             {v5.d}[1], [x3], x1
+        srshr           \coef4, \coef4, #6
+        uaddw2          \coef1, \coef1, v4.8h
+        ld1             {v6.4h},   [x0], x1
+        srshr           \coef5, \coef5, #6
+        uaddw           \coef2, \coef2, v5.4h
+        ld1             {v6.d}[1], [x3], x1
+        sqxtun          v4.4h,  \coef0                   // narrow to 16 bits with unsigned saturation
+        srshr           \coef6, \coef6, #6
+        uaddw2          \coef3, \coef3, v5.8h
+        ld1             {v7.4h},   [x0], x1
+        sqxtun2         v4.8h,  \coef1
+        srshr           \coef7, \coef7, #6
+        uaddw           \coef4, \coef4, v6.4h
+        ld1             {v7.d}[1], [x3], x1
+        umin            v4.8h,  v4.8h,  v8.8h            // clamp to the maximum pixel value held in v8
+        sub             x0,  x0,  x1, lsl #2             // rewind both pointers by the four steps taken while loading
+        sub             x3,  x3,  x1, lsl #2
+        sqxtun          v5.4h,  \coef2
+        uaddw2          \coef5, \coef5, v6.8h
+        st1             {v4.4h},   [x0], x1
+        sqxtun2         v5.8h,  \coef3
+        uaddw           \coef6, \coef6, v7.4h
+        st1             {v4.d}[1], [x3], x1
+        umin            v5.8h,  v5.8h,  v8.8h
+        sqxtun          v6.4h,  \coef4
+        uaddw2          \coef7, \coef7, v7.8h
+        st1             {v5.4h},   [x0], x1
+        sqxtun2         v6.8h,  \coef5
+        st1             {v5.d}[1], [x3], x1
+        umin            v6.8h,  v6.8h,  v8.8h
+        sqxtun          v7.4h,  \coef6
+        st1             {v6.4h},   [x0], x1
+        sqxtun2         v7.8h,  \coef7
+        st1             {v6.d}[1], [x3], x1
+        umin            v7.8h,  v7.8h,  v8.8h
+        st1             {v7.4h},   [x0], x1              // after the stores x0/x3 point at the next group of rows
+        st1             {v7.d}[1], [x3], x1
+.endm
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x4 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
+.macro itxfm16_1d_funcs txfm
+function \txfm\()16_1d_4x16_pass1_neon
+        mov             x14, x30                         // keep the return address; the bl below overwrites x30
+
+        movi            v4.4s, #0                        // zero written back by load_clear
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              \txfm\()16
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        cmp             x1,  #12
+        b.eq            1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        store           \i,  x0,  #16
+.endr
+        br              x14                              // return through the saved link register
+1:
+        // Special case: For the last input column (x1 == 12),
+        // which would be stored as the last row in the temp buffer,
+        // don't store the first 4x4 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // last 4x4 block).
+        add             x0,  x0,  #16                    // skip the 16 bytes where v16 would have gone
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v17
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v18
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v19
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+
+        mov             v28.16b, v16.16b                 // hand the unstored block over in v28-v31, where pass 2 expects it
+        mov             v29.16b, v17.16b
+        mov             v30.16b, v18.16b
+        mov             v31.16b, v19.16b
+        br              x14
+endfunc
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 4x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_4x16_pass2_neon
+        mov             x14, x30                         // keep the return address; the bl below overwrites x30
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f                          // first slice: v28-v31 are still in registers from pass 1
+.irp i, 28, 29, 30, 31
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1                     // x3 = second destination row; x3 no longer holds the slice offset
+        lsl             x1,  x1,  #1                     // x0 and x3 each advance two rows at a time
+        bl              \txfm\()16
+
+        dup             v8.8h, w13                       // v8 = maximum pixel value, the clamp used by load_add_store
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct                                    // emits idct16_1d_4x16_pass1_neon and idct16_1d_4x16_pass2_neon
+itxfm16_1d_funcs iadst                                   // emits iadst16_1d_4x16_pass1_neon and iadst16_1d_4x16_pass2_neon
+
+// This is the minimum eob value for each subpartition, in increments of 4.
+const min_eob_idct_idct_16, align=4
+        .short  0, 10, 38, 89                            // read with ldrh by the idct/idct 16x16 function, which also compares eob against 10 and 38 directly
+endconst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon       // x0 = dst, x1 = dst stride, x2 = coefficients, w3 = eob, w13 = max pixel value
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct16x16_dc_add_neon            // eob == 1: only the DC coefficient is handled
+.endif
+        mov             x15, x30                         // keep the return address across the bl calls below
+        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
+.ifnc \txfm1\()_\txfm2,idct_idct
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+.endif
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        sub             sp,  sp,  #1024                  // temp buffer: 16 rows at the 64-byte stride set in x9
+
+        mov             x4,  x0                          // x4 = dst, x5 = dst stride, x6 = coefficients for the rest of the function
+        mov             x5,  x1
+        mov             x6,  x2
+
+        movrel          x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+        movrel          x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+        ld1             {v0.8h,v1.8h}, [x10]             // idct16 reads its coefficients from v0-v3
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+        mov             x9,  #64                         // row stride in bytes for both the input and the temp buffer
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #10
+        b.le            idct16x16_quarter_add_16_neon    // plain branch: that function runs the epilogue itself
+        cmp             w3,  #38
+        b.le            idct16x16_half_add_16_neon       // plain branch: that function runs the epilogue itself
+
+        movrel          x12, min_eob_idct_idct_16, 2
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  sp,  #(\i*64)               // pass 1 output: row i of the temp buffer
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+        ldrh            w1,  [x12], #2                   // next eob threshold from the table
+        cmp             w3,  w1
+        mov             x1,  #(16 - \i)/4                // count of 4-row groups for the zero-fill loop at 2:; mov leaves the flags alone
+        b.le            1f
+.endif
+.endif
+        mov             x1,  #\i
+        add             x2,  x6,  #(\i*4)                // pass 1 input: column i of the coefficients
+        bl              \txfm1\()16_1d_4x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+        ld1             {v0.8h,v1.8h}, [x10]             // iadst16 overwrote v0-v3; reload the idct coefficients for pass 2
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        // Set v28-v31 to zero, for the in-register passthrough of
+        // coefficients to pass 2.
+        movi            v28.4s,  #0
+        movi            v29.4s,  #0
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9 // one zeroed 64-byte row of the temp buffer
+.endr
+        b.ne            2b
+3:
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  x4,  #(\i*2)                // pass 2 output: column i of dst, 2 bytes per pixel
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)                // pass 2 input: column i of the temp buffer
+        mov             x3,  #\i
+        bl              \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+        add             sp,  sp,  #1024                  // release the temp buffer
+        ldp             d8,  d9,  [sp], 0x10             // restore in the reverse order of the pushes above
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+.endif
+        br              x15
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+        mov             x13, #0x03ff                     // (1 << 10) - 1: maximum 10-bit pixel value, used as the output clamp
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+        mov             x13, #0x0fff                     // (1 << 12) - 1: maximum 12-bit pixel value, used as the output clamp
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct                              // instantiate the 16x16 functions once per (txfm1, txfm2) pairing
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+
+function idct16_1d_4x16_pass1_quarter_neon               // pass 1 reading only the top 4 input rows: x0 = temp buffer, x2 = src, x9 = input stride
+        mov             x14, x30                         // keep the return address; the bl below overwrites x30
+
+        movi            v4.4s, #0                        // zero written back by load_clear
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_quarter
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        // The first 4x4 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        add             x0,  x0,  #16                    // skip the 16 bytes where v16 would have gone
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v17
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v18
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v19
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon               // pass 2 counterpart: x0 = dst, x1 = dst stride, x2 = temp buffer, x3 = slice offset, x9 = temp stride
+        mov             x14, x30                         // keep the return address; the bl below overwrites x30
+
+        // Only load the top 4 lines, and only do it for the later slices.
+        // For the first slice, v16-v19 is kept in registers from the first pass.
+        cbz             x3,  1f
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1                     // x3 = second destination row; x3 no longer holds the slice offset
+        lsl             x1,  x1,  #1                     // x0 and x3 each advance two rows at a time
+        bl              idct16_quarter
+
+        dup             v8.8h, w13                       // v8 = maximum pixel value, the clamp used by load_add_store
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon                  // pass 1 reading only the top 8 input rows: x0 = temp buffer, x1 = slice offset, x2 = src, x9 = input stride
+        mov             x14, x30                         // keep the return address; the bl below overwrites x30
+
+        movi            v4.4s, #0                        // zero written back by load_clear
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_half
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        cmp             x1,  #4
+        b.eq            1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        store           \i,  x0,  #16
+.endr
+        br              x14
+1:
+        // Special case: For the second input column (x1 == 4),
+        // which would be stored as the second row in the temp buffer,
+        // don't store the first 4x4 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // second 4x4 block).
+        add             x0,  x0,  #16                    // skip the 16 bytes where v16 would have gone
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v17
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v18
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16                    // likewise for v19
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+
+        mov             v20.16b, v16.16b                 // hand the unstored block over in v20-v23, where pass 2 expects it
+        mov             v21.16b, v17.16b
+        mov             v22.16b, v18.16b
+        mov             v23.16b, v19.16b
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon                  // pass 2 counterpart: x0 = dst, x1 = dst stride, x2 = temp buffer, x3 = slice offset, x9 = temp stride
+        mov             x14, x30                         // keep the return address; the bl below overwrites x30
+
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f                          // first slice: v20-v23 are still in registers from pass 1
+.irp i, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1                     // x3 = second destination row; x3 no longer holds the slice offset
+        lsl             x1,  x1,  #1                     // x0 and x3 each advance two rows at a time
+        bl              idct16_half
+
+        dup             v8.8h, w13                       // v8 = maximum pixel value, the clamp used by load_add_store
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon                  // branched to (not called) from vp9_idct_idct_16x16_add_16_neon once its frame, x4-x6, x9 and x15 are set up
+        add             x0,  sp,  #(0*64)
+        mov             x1,  #0
+        add             x2,  x6,  #(0*4)
+        bl              idct16_1d_4x16_pass1_\size\()_neon
+.ifc \size,half
+        add             x0,  sp,  #(4*64)                // the half variant runs pass 1 on a second slice
+        mov             x1,  #4
+        add             x2,  x6,  #(4*4)
+        bl              idct16_1d_4x16_pass1_\size\()_neon
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  x4,  #(\i*2)                // pass 2 output: column i of dst, 2 bytes per pixel
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)                // pass 2 input: column i of the temp buffer
+        mov             x3,  #\i
+        bl              idct16_1d_4x16_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #1024                  // epilogue matching the prologue of the function that branched here
+        ldp             d8,  d9,  [sp], 0x10
+        br              x15
+endfunc
+.endm
+
+idct16_partial quarter                                   // emits idct16x16_quarter_add_16_neon
+idct16_partial half                                      // emits idct16x16_half_add_16_neon
+
+function idct32x32_dc_add_neon                           // DC-only 32x32 add: x0 = dst, x1 = dst stride, x2 = coefficients, w13 = upper clamp
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h                    // widen the first four coefficients to 32 bits
+
+        movi            v1.4h,  #0                       // zero, stored below to clear the DC coefficient
+
+        ld1             {v2.s}[0],  [x2]                 // load the single 32-bit DC coefficient
+        smull           v2.2d,  v2.2s,  v0.s[0]          // first multiply by idct_coeffs[0] ...
+        rshrn           v2.2s,  v2.2d,  #14              // ... followed by a rounding shift by 14
+        smull           v2.2d,  v2.2s,  v0.s[0]          // second multiply by the same coefficient
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2]                 // clear the coefficient in the source buffer
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v0.4s,  v2.4s,  #6               // v0 = rounded (dc >> 6), the value added to every pixel
+
+        mov             x3,  x0                          // x0 is the load pointer, x3 the store pointer
+        mov             x4,  #32                         // number of rows left to process
+        sub             x1,  x1,  #32                    // the first half-row access already advances the pointer by 32 bytes
+        dup             v31.8h, w13                      // v31 = upper clamp applied to every output pixel
+1:
+        // Loop to add the constant v0 into all 32x32 outputs
+        subs            x4,  x4,  #1
+        ld1             {v1.8h,v2.8h},  [x0], #32        // first 16 pixels of the row
+        uaddw           v16.4s, v0.4s,  v1.4h            // widen the unsigned pixels and add the 32-bit constant
+        uaddw2          v17.4s, v0.4s,  v1.8h
+        ld1             {v3.8h,v4.8h},  [x0], x1         // last 16 pixels, then step to the next row
+        uaddw           v18.4s, v0.4s,  v2.4h
+        uaddw2          v19.4s, v0.4s,  v2.8h
+        uaddw           v20.4s, v0.4s,  v3.4h
+        uaddw2          v21.4s, v0.4s,  v3.8h
+        uaddw           v22.4s, v0.4s,  v4.4h
+        uaddw2          v23.4s, v0.4s,  v4.8h
+        sqxtun          v1.4h,  v16.4s                   // narrow back to 16 bits with unsigned saturation
+        sqxtun2         v1.8h,  v17.4s
+        sqxtun          v2.4h,  v18.4s
+        sqxtun2         v2.8h,  v19.4s
+        sqxtun          v3.4h,  v20.4s
+        sqxtun2         v3.8h,  v21.4s
+        sqxtun          v4.4h,  v22.4s
+        sqxtun2         v4.8h,  v23.4s
+        umin            v1.8h,  v1.8h,  v31.8h           // clamp against v31
+        umin            v2.8h,  v2.8h,  v31.8h
+        st1             {v1.8h,v2.8h},  [x3], #32
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h,v4.8h},  [x3], x1
+        b.ne            1b                               // flags still come from the subs at the top of the loop
+
+        ret
+endfunc
+
+.macro idct32_end                                        // final stages expanded at the end of idct32_odd; ends with ret
+        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
+        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
+        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
+        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
+        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
+        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
+        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
+        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
+
+        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
+        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
+        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
+        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
+        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
+        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
+        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
+        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
+        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
+
+        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
+        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
+        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
+        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
+        ret                                              // returns from whichever function expanded this macro
+.endm
+
+function idct32_odd                                      // t16..t31 stages on v16-v31; reads coefficients from v10-v13 and v0/v1 - NOTE(review): where v10-v13 get loaded is outside this chunk
+        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_half // idct32_odd variant for the _half case, where the callers load only v16-v23 (the first 8 odd rows)
+        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a (_h1: presumably only the first input, here a loaded register, is used -- confirm in the macro)
+        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a (_h2: presumably only the second input, here a loaded register, is used -- confirm in the macro)
+        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end // same tail as idct32_odd: from the second stage on, the code is identical to the full version
+endfunc
+
+function idct32_odd_quarter // idct32_odd variant for the _quarter case: only v16-v19 are read as inputs
+        dsmull_h        v4,  v5,  v16, v10.s[0] // eight products of one input register with one coefficient lane, presumably each written to a register pair
+        dsmull_h        v28, v29, v19, v11.s[3]
+        dsmull_h        v30, v31, v16, v10.s[1]
+        dsmull_h        v22, v23, v17, v13.s[2]
+        dsmull_h        v7,  v6,  v17, v13.s[3]
+        dsmull_h        v26, v27, v19, v11.s[2]
+        dsmull_h        v20, v21, v18, v12.s[0]
+        dsmull_h        v24, v25, v18, v12.s[1]
+
+        neg             v28.2d, v28.2d // negate the 64 bit lanes of two of the products (pairs v28/v29 and v7/v6)
+        neg             v29.2d, v29.2d
+        neg             v7.2d,  v7.2d
+        neg             v6.2d,  v6.2d
+
+        drshrn_h        v4,  v4,  v5,  #14 // presumably a rounding narrowing shift by 14 of a register pair into one register -- confirm in drshrn_h
+        drshrn_h        v5,  v28, v29, #14
+        drshrn_h        v29, v30, v31, #14
+        drshrn_h        v28, v22, v23, #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v31, v26, v27, #14
+        drshrn_h        v6,  v20, v21, #14
+        drshrn_h        v30, v24, v25, #14
+
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1] // presumably leaves two wide result pairs (v16/v17 and v18/v19) that the drshrn_h calls below narrow
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
+        drshrn_h        v23, v16, v17, #14
+        drshrn_h        v24, v18, v19, #14
+        neg             v20.2d, v20.2d // the pair v20/v21 is negated before it is narrowed
+        neg             v21.2d, v21.2d
+        drshrn_h        v27, v27, v26, #14
+        drshrn_h        v20, v20, v21, #14
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3] // v16-v19 are free again here and get reused for the next result pairs
+        drshrn_h        v21, v16, v17, #14
+        drshrn_h        v26, v18, v19, #14
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
+        drshrn_h        v25, v16, v17, #14
+        neg             v18.2d, v18.2d // the pair v18/v19 is negated before it is narrowed
+        neg             v19.2d, v19.2d
+        drshrn_h        v22, v18, v19, #14
+
+        idct32_end // same tail macro as idct32_odd and idct32_odd_half
+endfunc
+
+.macro idct32_funcs suffix
+// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs;
+// a normal IDCT16 with every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point IDCT
+// of the odd inputs, added/subtracted onto the outputs of the first idct16.
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
+function idct32_1d_4x32_pass1\suffix\()_neon // pass 1: idct16 on the even rows plus idct32_odd on the odd rows, written transposed to the temp buffer at x0
+        mov             x14, x30 // keep the return address; the bl calls below overwrite x30
+
+        movi            v4.4s,  #0 // zero vector, presumably what load_clear writes back over the input -- confirm in load_clear
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct16\suffix // even-input half; the matching idct16 variant is defined outside this part of the file
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the registers a, b, c, d horizontally, followed by the
+        // same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+        // There's no rev128 instruction, but we reverse each 64 bit
+        // half, and then flip them using an ext with 8 bytes offset.
+        rev64           v7.4s, \d // v4-v7 hold the reversed copies; the reversal work is interleaved with the stores
+        st1             {\a},  [x0], #16
+        ext             v7.16b, v7.16b, v7.16b, #8
+        st1             {\b},  [x0], #16
+        rev64           v6.4s, \c
+        st1             {\c},  [x0], #16
+        ext             v6.16b, v6.16b, v6.16b, #8
+        st1             {\d},  [x0], #16
+        rev64           v5.4s, \b
+        st1             {v7.4s},  [x0], #16
+        ext             v5.16b, v5.16b, v5.16b, #8
+        st1             {v6.4s},  [x0], #16
+        rev64           v4.4s, \a
+        st1             {v5.4s},  [x0], #16
+        ext             v4.16b, v4.16b, v4.16b, #8
+        st1             {v4.4s},  [x0], #16 // eighth store: one expansion advances x0 by 8 x 16 = 128 bytes
+.endm
+        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
+        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
+        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
+        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
+        sub             x0,  x0,  #512 // rewind to the start of what was just written (4 x 128 bytes)
+.purgem store_rev
+
+        // Move x2 back to the start of the input, and move
+        // to the first odd row
+.ifb \suffix
+        sub             x2,  x2,  x9, lsl #4 // 16 rows were loaded
+.endif
+.ifc \suffix,_quarter
+        sub             x2,  x2,  x9, lsl #2 // 4 rows were loaded
+.endif
+.ifc \suffix,_half
+        sub             x2,  x2,  x9, lsl #3 // 8 rows were loaded
+.endif
+        add             x2,  x2,  #128 // one input row further; the caller in this file sets the double stride x9 to 256
+
+        movi            v4.4s,  #0 // v4 was used as scratch above, so it is zeroed again before the next loads
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct32_odd\suffix // odd-input half
+
+        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
+        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
+        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
+        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
+
+        // Store the registers a, b, c, d horizontally,
+        // adding into the output first, and then mirrored,
+        // subtracted from the output.
+.macro store_rev a, b, c, d, a16b, b16b
+        ld1             {v4.4s},  [x0] // v4 is the accumulator: load what the first half stored, combine, store back
+        rev64           v9.4s, \d // reversed copies of d and c go to v9 and v8; a and b are reversed in place further down
+        add             v4.4s, v4.4s, \a
+        st1             {v4.4s},  [x0], #16
+        rev64           v8.4s, \c
+        ld1             {v4.4s},  [x0]
+        ext             v9.16b, v9.16b, v9.16b, #8
+        add             v4.4s, v4.4s, \b
+        st1             {v4.4s},  [x0], #16
+        ext             v8.16b, v8.16b, v8.16b, #8
+        ld1             {v4.4s},  [x0]
+        rev64           \b, \b
+        add             v4.4s, v4.4s, \c
+        st1             {v4.4s},  [x0], #16
+        rev64           \a, \a
+        ld1             {v4.4s},  [x0]
+        ext             \b16b, \b16b, \b16b, #8 // a16b and b16b are passed the same registers as a and b, with the .16b arrangement that ext needs
+        add             v4.4s, v4.4s, \d
+        st1             {v4.4s},  [x0], #16
+        ext             \a16b, \a16b, \a16b, #8
+        ld1             {v4.4s},  [x0] // second half of the row: subtract the reversed d, c, b, a
+        sub             v4.4s, v4.4s, v9.4s
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, v8.4s
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, \b
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, \a
+        st1             {v4.4s},  [x0], #16
+.endm
+
+        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
+        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
+        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
+        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
+.purgem store_rev
+        br              x14 // return through the address saved on entry
+endfunc
+
+// This is mostly the same as 4x32_pass1, but without the transpose;
+// it uses the source as temp buffer between the two idct passes, and
+// adds the result into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_4x32_pass2\suffix\()_neon // pass 2: same transform on the temp buffer at x2, then round, add to the pixels at x0 and clamp
+        mov             x14, x30 // keep the return address; the bl calls below overwrite x30
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4 // back to the first row (16 rows were loaded)
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2 // back to the first row (4 rows were loaded)
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3 // back to the first row (8 rows were loaded)
+.endif
+
+        bl              idct16\suffix // even-input half
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        store           \i, x2, x9 // park all 16 idct16 outputs in the temp buffer; load_acc_store reads them back below
+.endr
+
+        sub             x2,  x2,  x9, lsl #4 // rewind over the 16 stored rows
+        add             x2,  x2,  #128 // move to the first odd row
+
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        sub             x2,  x2,  #128 // back to row 0, where the idct16 outputs were stored
+
+        bl              idct32_odd\suffix // odd-input half
+
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+        ld1             {v4.4s},  [x2], x9 // neg=0: walk forwards through the stored idct16 rows and add the odd-half values
+        ld1             {v5.4s},  [x2], x9
+        add             v4.4s, v4.4s, \a
+        ld1             {v6.4s},  [x2], x9
+        add             v5.4s, v5.4s, \b
+        ld1             {v7.4s},  [x2], x9
+        add             v6.4s, v6.4s, \c
+        add             v7.4s, v7.4s, \d
+.else
+        ld1             {v4.4s},  [x2], x7 // neg=1: walk backwards (x7 is the negated stride) and subtract instead
+        ld1             {v5.4s},  [x2], x7
+        sub             v4.4s, v4.4s, \a
+        ld1             {v6.4s},  [x2], x7
+        sub             v5.4s, v5.4s, \b
+        ld1             {v7.4s},  [x2], x7
+        sub             v6.4s, v6.4s, \c
+        sub             v7.4s, v7.4s, \d
+.endif
+        ld1             {v8.4h},   [x0], x1 // four destination rows of four 16 bit pixels, packed two rows per register
+        ld1             {v8.d}[1], [x0], x1
+        srshr           v4.4s, v4.4s, #6 // rounding shift right by 6
+        ld1             {v9.4h},   [x0], x1
+        srshr           v5.4s, v5.4s, #6
+        uaddw           v4.4s, v4.4s, v8.4h // add the pixels, zero-extended to 32 bit
+        ld1             {v9.d}[1], [x0], x1
+        srshr           v6.4s, v6.4s, #6
+        uaddw2          v5.4s, v5.4s, v8.8h
+        srshr           v7.4s, v7.4s, #6
+        sub             x0,  x0,  x1, lsl #2 // back to the first of the four rows for the stores
+        uaddw           v6.4s, v6.4s, v9.4h
+        sqxtun          v4.4h, v4.4s // narrow to unsigned 16 bit with saturation
+        uaddw2          v7.4s, v7.4s, v9.8h
+        sqxtun2         v4.8h, v5.4s
+        umin            v4.8h, v4.8h, v15.8h // clamp to the max pixel value held in v15
+        st1             {v4.4h},   [x0], x1
+        sqxtun          v5.4h, v6.4s
+        st1             {v4.d}[1], [x0], x1
+        sqxtun2         v5.8h, v7.4s
+        umin            v5.8h, v5.8h, v15.8h
+        st1             {v5.4h},   [x0], x1
+        st1             {v5.d}[1], [x0], x1
+.endm
+        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
+        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
+        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
+        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
+        sub             x2,  x2,  x9 // the forward loads stepped one row past the last one; step back onto it for the backwards walk
+        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
+        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
+        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
+        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
+.purgem load_acc_store
+        br              x14 // return through the address saved on entry
+endfunc
+.endm
+
+idct32_funcs // full variant: both passes load 16 rows for each half of the transform
+idct32_funcs _quarter // variant that loads only 4 rows for each half
+idct32_funcs _half // variant that loads only 8 rows for each half
+
+const min_eob_idct_idct_32, align=4 // per-slice thresholds compared against w3 (presumably the eob, going by the name) in the first pass of the 32x32 transform
+        .short  0, 9, 34, 70, 135, 240, 336, 448 // the reader starts at the second entry: slice k (k >= 1) and all later slices are zero-filled when w3 <= entry k
+endconst
+
+function vp9_idct_idct_32x32_add_16_neon // common 32x32 IDCT + add body; the exported 10 and 12 bit entry points branch here with x13 = max pixel value
+        cmp             w3,  #1 // w3 is presumably the eob (cf. min_eob_idct_idct_32); 1 selects the DC-only routine, defined elsewhere
+        b.eq            idct32x32_dc_add_neon
+
+        movrel          x10, idct_coeffs
+
+        mov             x15, x30 // keep the return address; the function ends with br x15
+        stp             d8,  d9,  [sp, #-0x10]! // d8-d15 are callee-saved under AAPCS64; v8-v15 are used by this function and its helpers
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d14, d15, [sp, #-0x10]!
+
+        sub             sp,  sp,  #4096 // temp buffer between the two passes: 32 rows of 128 bytes
+
+        mov             x4,  x0 // x4-x6 keep the incoming x0-x2 (used below as dst, dst stride and src); x0-x2 become per-call arguments
+        mov             x5,  x1
+        mov             x6,  x2
+
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #256
+        neg             x7,  x9 // negative double stride, used by pass 2 to walk backwards
+
+        ld1             {v0.8h,v1.8h},   [x10], #32 // first 16 halfword coefficients, sign-extended to 32 bit into v0-v3
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h // v0 is widened last because it is still a source for the line above
+        ld1             {v10.8h,v11.8h}, [x10] // next 16 halfword coefficients, sign-extended into v10-v13
+        sxtl            v12.4s, v11.4h
+        sxtl2           v13.4s, v11.8h
+        sxtl2           v11.4s, v10.8h
+        sxtl            v10.4s, v10.4h
+
+        dup             v15.8h, w13 // max pixel value in every lane, for the umin clamp in pass 2
+
+        cmp             w3,  #34 // small w3: hand over to the partial variants (plain branches; they run the epilogue themselves)
+        b.le            idct32x32_quarter_add_16_neon
+        cmp             w3,  #135
+        b.le            idct32x32_half_add_16_neon
+
+        movrel          x12, min_eob_idct_idct_32, 2 // presumably table address + 2 bytes, i.e. the second entry; slice 0 is always computed
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  sp,  #(\i*128) // temp buffer position for this slice (4 rows of 128 bytes per slice)
+.if \i > 0
+        ldrh            w1,  [x12], #2 // threshold for this slice
+        cmp             w3,  w1
+        mov             x1,  #(32 - \i)/4 // zero-fill loop count for the bail-out path; mov leaves the cmp flags intact
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*4) // byte offset of this slice within the input
+        bl              idct32_1d_4x32_pass1_neon
+.endr
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.4s,  #0
+        movi            v17.4s,  #0
+        movi            v18.4s,  #0
+        movi            v19.4s,  #0
+2:
+        subs            x1,  x1,  #1 // sets the flags for b.ne below; the stores in between do not touch them
+.rept 4
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64 // 8 stores of 64 bytes per loop iteration = 512 bytes, one slice
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+.endr
+        b.ne            2b
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  x4,  #(\i*2) // destination advances by 4 pixels of 2 bytes per slice
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4) // matching group of 4 columns (4 bytes each) in the temp buffer
+        bl              idct32_1d_4x32_pass2_neon
+.endr
+
+        add             sp,  sp,  #4096 // release the temp buffer
+        ldp             d14, d15, [sp], 0x10 // restore d8-d15 in reverse order of the saves
+        ldp             d12, d13, [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d8,  d9,  [sp], 0x10
+
+        br              x15 // return through the address saved on entry
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1 // exported 10 bit entry point
+        mov             x13, #1023 // (1 << 10) - 1, the largest 10 bit pixel value; the shared body clamps against it
+        b               vp9_idct_idct_32x32_add_16_neon // plain branch, so x30 still holds the address of the original caller
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1 // exported 12 bit entry point
+        mov             x13, #4095 // (1 << 12) - 1, the largest 12 bit pixel value; the shared body clamps against it
+        b               vp9_idct_idct_32x32_add_16_neon // plain branch, so x30 still holds the address of the original caller
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_16_neon // entered by branch from vp9_idct_idct_32x32_add_16_neon after its prologue: temp buffer on the stack, x4-x6 and x15 already set up
+.irp i, 0, 4
+        add             x0,  sp,  #(\i*128) // temp buffer position for this slice
+.ifc \size,quarter
+.if \i == 4
+        cmp             w3,  #9 // quarter only: the second slice is zero-filled instead when w3 <= 9
+        b.le            1f
+.endif
+.endif
+        add             x2,  x6,  #(\i*4) // byte offset of this slice within the input
+        bl              idct32_1d_4x32_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 8, 12
+        add             x0,  sp,  #(\i*128)
+.if \i == 12
+        cmp             w3,  #70 // half only: the fourth slice is zero-filled instead when w3 <= 70
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*4)
+        bl              idct32_1d_4x32_pass1_\size\()_neon
+.endr
+.endif
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.4s,  #0
+        movi            v17.4s,  #0
+        movi            v18.4s,  #0
+        movi            v19.4s,  #0
+
+.rept 4
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64 // 8 stores of 64 bytes = 512 bytes: exactly the one slice that was skipped
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+.endr
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  x4,  #(\i*2) // destination advances by 4 pixels of 2 bytes per slice
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4) // matching group of 4 columns (4 bytes each) in the temp buffer
+        bl              idct32_1d_4x32_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #4096 // epilogue mirrors the prologue of vp9_idct_idct_32x32_add_16_neon
+        ldp             d14, d15, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d8,  d9,  [sp], 0x10
+
+        br              x15 // return address saved by vp9_idct_idct_32x32_add_16_neon
+endfunc
+.endm
+
+idct32_partial quarter // emits idct32x32_quarter_add_16_neon, the branch target for w3 <= 34
+idct32_partial half // emits idct32x32_half_add_16_neon, the branch target for w3 <= 135
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 3ffb418963..99413b0f70 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
new file mode 100644
index 0000000000..9075f3d406
--- /dev/null
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+
+.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 // transposes two 4x4 blocks of 16 bit elements held in r0-r3 (one per 64 bit half), in place; t4-t7 are overwritten
+        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h // step 1: pair up 16 bit lanes of r0/r1 (even lanes here, odd lanes on the next line)
+        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h // and likewise for r2/r3
+        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
+
+        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s // step 2: combine the pairs as 32 bit lanes, giving the transposed rows 0 and 2
+        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
+        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s // transposed rows 1 and 3
+        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
+.endm
+
+// The input to and output from this macro is in the registers v16-v31,
+// and v0-v7 are used as scratch registers.
+// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
+// Depending on the width of the loop filter, we either use v16-v19
+// and v28-v31 as temp registers, or v8-v15.
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+        dup             v0.8h,  w2                   // E
+        dup             v2.8h,  w3                   // I
+        dup             v3.8h,  w4                   // H
+
+        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
+        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
+        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
+        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
+        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
+        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
+        umax            v4.8h,  v4.8h,  v5.8h
+        umax            v5.8h,  v6.8h,  v7.8h
+        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
+        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
+        umax            v4.8h,  v4.8h,  v5.8h
+        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
+        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
+        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
+        ushr            v5.8h,  v5.8h,  #1
+        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
+        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        cmhs            v6.8h,  v0.8h,  v6.8h
+        and             v4.16b, v4.16b, v6.16b       // fm
+
+        // If no pixels need filtering, just exit as soon as possible
+        mov             x11, v4.d[0]
+        mov             x12, v4.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        br              x10
+1:
+
+.if \wd >= 8
+        dup             v0.8h,  w5
+
+        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
+        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
+        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
+        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
+        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
+        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
+        umax            v6.8h,  v6.8h,  v2.8h
+        umax            v1.8h,  v1.8h,  \tmp1\().8h
+        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
+.if \wd == 16
+        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
+        umax            v6.8h,  v6.8h,  v1.8h
+        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
+        umax            v6.8h,  v6.8h,  \tmp2\().8h
+        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
+        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
+        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
+        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
+        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
+        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
+        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
+        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
+        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)
+
+        umax            v7.8h,  v7.8h,  v2.8h
+        umax            v1.8h,  v1.8h,  v8.8h
+        umax            v9.8h,  v9.8h,  v10.8h
+        umax            v11.8h, v11.8h, v12.8h
+        // The rest of the calculation of flat8out is interleaved below
+.else
+        // The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+        // Calculate the normal inner loop filter for 2 or 4 pixels
+        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
+.if \wd == 16
+        umax            v7.8h,  v7.8h,  v1.8h
+        umax            v9.8h,  v9.8h,  v11.8h
+.elseif \wd == 8
+        umax            v6.8h,  v6.8h,  v1.8h
+.endif
+        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
+.if \wd == 16
+        umax            v7.8h,  v7.8h,  v9.8h
+.elseif \wd == 8
+        umax            v6.8h,  v6.8h,  \tmp2\().8h
+.endif
+        dup             \tmp2\().8h,  w6                        // left shift for saturation
+        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
+        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
+        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
+        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
+        movi            \tmp5\().8h,  #3
+.if \wd == 8
+        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
+.endif
+        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
+.if \wd == 8
+        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
+.endif
+        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
+.if \wd == 16
+        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
+.elseif \wd == 8
+        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
+.endif
+        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
+.if \wd == 16
+        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
+.endif
+        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
+        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int8 = 0
+        movi            v2.8h,  #4
+        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+        movi            v3.8h,  #3
+        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
+        movi            \tmp5\().8h,  #0
+        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+        dup             \tmp6\().8h,  w7                        // max pixel value
+.if \wd == 16
+        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
+.endif
+
+        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1
+
+        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
+        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
+        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
+        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2
+
+        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
+        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
+        smin            v0.8h,   v0.8h,   \tmp6\().8h
+        smin            v2.8h,   v2.8h,   \tmp6\().8h
+        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
+        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
+        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
+        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
+        bit             v24.16b, v2.16b,  v4.16b
+
+        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
+        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
+.if \wd >= 8
+        mov             x11, v6.d[0]
+.endif
+        smin            v0.8h,  v0.8h,  \tmp6\().8h
+        smin            v2.8h,  v2.8h,  \tmp6\().8h
+.if \wd >= 8
+        mov             x12, v6.d[1]
+.endif
+        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
+        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
+.if \wd >= 8
+        adds            x11, x11, x12
+.endif
+        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
+        bit             v25.16b, v2.16b,  v5.16b
+
+        // If no pixels need flat8in, jump to flat8out
+        // (or to a writeout of the inner 4 pixels, for wd=8)
+.if \wd >= 8
+.if \wd == 16
+        b.eq            6f
+.else
+        b.ne            1f
+        br              x13
+1:
+.endif
+
+        // flat8in
+        add             \tmp1\().8h, v20.8h, v21.8h
+        add             \tmp3\().8h, v22.8h, v25.8h
+        add             \tmp5\().8h, v20.8h, v22.8h
+        add             \tmp7\().8h, v23.8h, v26.8h
+        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
+        add             v0.8h,  v0.8h,  v23.8h
+        add             v0.8h,  v0.8h,  v24.8h
+        add             v0.8h,  v0.8h,  \tmp5\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+        urshr           v2.8h,  v0.8h,  #3                      // out p2
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        add             \tmp1\().8h, v20.8h,  v23.8h
+        add             \tmp3\().8h, v24.8h,  v27.8h
+        urshr           v3.8h,  v0.8h,  #3                      // out p1
+
+        add             v0.8h,  v0.8h,  \tmp7\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        add             \tmp5\().8h, v21.8h,  v24.8h
+        add             \tmp7\().8h, v25.8h,  v27.8h
+        urshr           v4.8h,  v0.8h,  #3                      // out p0
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+        add             \tmp1\().8h, v22.8h,  v25.8h
+        add             \tmp3\().8h, v26.8h,  v27.8h
+        urshr           v5.8h,  v0.8h,  #3                      // out q0
+
+        add             v0.8h,  v0.8h,  \tmp7\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        // The output here is written back into the input registers. This doesn't
+        // matter for the flat8out part below, since we only update those pixels
+        // which won't be touched below.
+        bit             v21.16b, v2.16b,  v6.16b
+        bit             v22.16b, v3.16b,  v6.16b
+        bit             v23.16b, v4.16b,  v6.16b
+        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
+        bit             v24.16b, v5.16b,  v6.16b
+        bit             v25.16b, \tmp5\().16b,  v6.16b
+        bit             v26.16b, \tmp6\().16b,  v6.16b
+.endif
+.if \wd == 16
+6:
+        orr             v2.16b,  v6.16b,  v7.16b
+        mov             x11, v2.d[0]
+        mov             x12, v2.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        // If no pixels needed flat8in nor flat8out, jump to a
+        // writeout of the inner 4 pixels
+        br              x14
+1:
+
+        mov             x11, v7.d[0]
+        mov             x12, v7.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+        br              x15
+
+1:
+        // flat8out
+        // This writes all outputs into v2-v17 (skipping v7 and v16).
+        // If this part is skipped, the output is read from v21-v26 (which is the input
+        // to this section).
+        shl             v0.8h,   v16.8h,  #3     // 8 * v16
+        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
+        add             v0.8h,   v0.8h,   v17.8h
+        add             v8.8h,   v17.8h,  v18.8h
+        add             v10.8h,  v19.8h,  v20.8h
+        add             v0.8h,   v0.8h,   v8.8h
+        add             v8.8h,   v16.8h,  v17.8h
+        add             v12.8h,  v21.8h,  v22.8h
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v10.8h,  v18.8h,  v25.8h
+        add             v14.8h,  v23.8h,  v24.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        add             v0.8h,   v0.8h,   v12.8h
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v18.8h
+        add             v14.8h,  v19.8h,  v26.8h
+        urshr           v2.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v8.8h,   v16.8h,  v19.8h
+        add             v10.8h,  v20.8h,  v27.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        bif             v2.16b,  v17.16b, v7.16b
+        urshr           v3.8h ,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v20.8h
+        add             v14.8h,  v21.8h,  v28.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        bif             v3.16b,  v18.16b, v7.16b
+        urshr           v4.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v8.8h,   v16.8h,  v21.8h
+        add             v10.8h,  v22.8h,  v29.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        bif             v4.16b,  v19.16b, v7.16b
+        urshr           v5.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v22.8h
+        add             v14.8h,  v23.8h,  v30.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        bif             v5.16b,  v20.16b, v7.16b
+        urshr           v6.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v10.8h,  v16.8h,  v23.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        add             v12.8h,  v24.8h,  v31.8h
+        bif             v6.16b,  v21.16b, v7.16b
+        urshr           v8.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        sub             v10.8h,  v12.8h,  v10.8h
+        add             v12.8h,  v17.8h,  v24.8h
+        add             v14.8h,  v25.8h,  v31.8h
+        bif             v8.16b,  v22.16b, v7.16b
+        urshr           v9.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        add             v12.8h,  v26.8h,  v31.8h
+        bif             v9.16b,  v23.16b, v7.16b
+        urshr           v10.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v14.8h,  v18.8h,  v25.8h
+        add             v18.8h,  v19.8h,  v26.8h
+        sub             v12.8h,  v12.8h,  v14.8h
+        add             v14.8h,  v27.8h,  v31.8h
+        bif             v10.16b, v24.16b, v7.16b
+        urshr           v11.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v12.8h
+        add             v12.8h,  v20.8h,  v27.8h
+        sub             v14.8h,  v14.8h,  v18.8h
+        add             v18.8h,  v28.8h,  v31.8h
+        bif             v11.16b, v25.16b, v7.16b
+        sub             v18.8h,  v18.8h,  v12.8h
+        urshr           v12.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v14.8h,  v21.8h,  v28.8h
+        add             v20.8h,  v29.8h,  v31.8h
+        bif             v12.16b, v26.16b, v7.16b
+        urshr           v13.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v18.8h
+        sub             v20.8h,  v20.8h,  v14.8h
+        add             v18.8h,  v22.8h,  v29.8h
+        add             v22.8h,  v30.8h,  v31.8h
+        bif             v13.16b, v27.16b, v7.16b
+        urshr           v14.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v20.8h
+        sub             v22.8h,  v22.8h,  v18.8h
+        bif             v14.16b, v28.16b, v7.16b
+        urshr           v15.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v22.8h
+        bif             v15.16b, v29.16b, v7.16b
+        urshr           v17.8h,  v0.8h,   #4
+        bif             v17.16b, v30.16b, v7.16b
+.endif
+.endm
+
+// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
+// while we need those for inputs/outputs in wd=16 and use v8-v15
+// for temp registers there instead.
+function vp9_loop_filter_4 // out-of-line wd=4 filter body, entered with bl from the loop_filter_4 macro
+        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31 // wd=4 with v16-v19 and v28-v31 as scratch
+        ret // x30 was set by the bl in loop_filter_4
+endfunc
+
+function vp9_loop_filter_8 // out-of-line wd=8 filter body, entered with bl from the loop_filter_8 macro
+        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31 // leaves through br x13 instead when no pixel needs flat8in
+        ret // reached once the flat8in section has run
+endfunc
+
+function vp9_loop_filter_16 // out-of-line wd=16 filter body, entered with bl from the loop_filter_16 macro
+        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15 // v8-v15 as scratch; can leave early through br x14 or br x15
+        ret // reached once the flat8out section has run
+endfunc
+
+.macro loop_filter_4
+        bl              vp9_loop_filter_4 // call the shared wd=4 body; bl overwrites x30, so users save it in x10 first
+.endm
+
+.macro loop_filter_8
+        // calculate alternative 'return' targets
+        adr             x13, 6f // label 6 of the expanding function: writeout of only the inner 4 pixels
+        bl              vp9_loop_filter_8 // the body does br x13 instead of ret when no pixel needs flat8in
+.endm
+
+.macro loop_filter_16
+        // calculate alternative 'return' targets
+        adr             x14, 7f // label 7: used when neither flat8in nor flat8out applies (inner 4 pixels written)
+        adr             x15, 8f // label 8: used when flat8out is not needed (inner 6 pixels written)
+        bl              vp9_loop_filter_16 // falls back to a plain ret only after the flat8out section ran
+.endm
+
+
+// The public functions in this file have got the following signature:
+// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
+
+.macro bpp_frontend func, bpp, push
+function ff_\func\()_\bpp\()_neon, export=1 // public entry point for one bit depth, forwarding to the generic _16_neon routine
+.if \push
+        mov             x16, x30 // keep the return address; the bl below overwrites x30
+        stp             d14, d15, [sp, #-0x10]! // save d8-d15: the wd=16 body uses v8-v15 as scratch and AAPCS64 makes them callee-saved
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+.endif
+        lsl             w2,  w2,  #\bpp - 8 // mb_lim: scale from 8 bit units to the target bit depth
+        lsl             w3,  w3,  #\bpp - 8 // lim: same scaling
+        lsl             w4,  w4,  #\bpp - 8 // hev_thr: same scaling
+        mov             x5,  #1 << (\bpp - 8) // the value 1 in the same scaled units
+        mov             x6,  #16 - \bpp // count of unused top bits of a 16 bit lane at this depth
+        mov             x7,  #((1 << \bpp) - 1) // largest pixel value at this depth
+.if \push
+        bl              \func\()_16_neon // real call, since d8-d15 must be restored afterwards
+        ldp             d8,  d9,  [sp], 0x10 // restore in the reverse order of the saves above
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+        br              x16 // return to the original caller
+.else
+        b               \func\()_16_neon // tail call: x30 is untouched, so the callee returns straight to our caller
+.endif
+endfunc
+.endm
+
+.macro bpp_frontends func, push=0
+        bpp_frontend    \func, 10, \push // 10 bit entry point
+        bpp_frontend    \func, 12, \push // 12 bit entry point
+.endm
+
+.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1 // public entry point that runs the internal routine twice, on two adjacent 8 pixel spans
+        mov             x16, x30 // keep the return address across the two bl calls
+.if \push
+        stp             d14, d15, [sp, #-0x10]! // save d8-d15: the wd=16 body uses v8-v15 as scratch and AAPCS64 makes them callee-saved
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+.endif
+        lsl             w2,  w2,  #\bpp - 8 // mb_lim: scale from 8 bit units to the target bit depth
+        lsl             w3,  w3,  #\bpp - 8 // lim: same scaling
+        lsl             w4,  w4,  #\bpp - 8 // hev_thr: same scaling
+        mov             x5,  #1 << (\bpp - 8) // the value 1 in the same scaled units
+        mov             x6,  #16 - \bpp // count of unused top bits of a 16 bit lane at this depth
+        mov             x7,  #((1 << \bpp) - 1) // largest pixel value at this depth
+        bl              \func\()_\int_suffix\()_16_neon // first span; the routines in this file leave x0 at its entry value
+.ifc \dir,h
+        add             x0,  x0,  x1, lsl #3 // h: the second span is the next 8 rows
+.else
+        add             x0,  x0,  #16 // v: the second span is the next 8 pixels (16 bytes) of the row
+.endif
+        bl              \func\()_\int_suffix\()_16_neon // second span; reuses w2-w4 and x5-x7, so the first call must not have changed them
+.if \push
+        ldp             d8,  d9,  [sp], 0x10 // restore in the reverse order of the saves above
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+.endif
+        br              x16 // return to the original caller
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
+        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push // 10 bit entry point
+        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push // 12 bit entry point
+.endm
+
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1 // two adjacent 8 pixel filters, each with its own width and thresholds
+        mov             x16, x30 // keep the return address across the two bl calls
+        lsr             w8,  w2,  #8 // bits 8 and up of mb_lim: value for the second half
+        lsr             w14, w3,  #8 // same for lim
+        lsr             w15, w4,  #8 // same for hev_thr
+        and             w2,  w2,  #0xff // low byte of mb_lim: value for the first half
+        and             w3,  w3,  #0xff // same for lim
+        and             w4,  w4,  #0xff // same for hev_thr
+        lsl             w2,  w2,  #\bpp - 8 // scale the first-half thresholds to the target bit depth
+        lsl             w3,  w3,  #\bpp - 8
+        lsl             w4,  w4,  #\bpp - 8
+        mov             x5,  #1 << (\bpp - 8) // the value 1 in the same scaled units
+        mov             x6,  #16 - \bpp // count of unused top bits of a 16 bit lane at this depth
+        mov             x7,  #((1 << \bpp) - 1) // largest pixel value at this depth
+        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon // first half, width wd1; w8, w14 and w15 have to survive this call
+.ifc \dir,h
+        add             x0,  x0,  x1, lsl #3 // h: the second half is the next 8 rows
+.else
+        add             x0,  x0,  #16 // v: the second half is the next 8 pixels (16 bytes) of the row
+.endif
+        lsl             w2,  w8,  #\bpp - 8 // bring in and scale the second-half thresholds
+        lsl             w3,  w14, #\bpp - 8
+        lsl             w4,  w15, #\bpp - 8
+        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon // second half, width wd2
+        br              x16 // return to the original caller
+endfunc
+.endm
+
+.macro bpp_frontends_mix2 wd1, wd2
+        bpp_frontend_mix2 \wd1, \wd2, v, 10 // v direction, 10 bit
+        bpp_frontend_mix2 \wd1, \wd2, v, 12 // v direction, 12 bit
+        bpp_frontend_mix2 \wd1, \wd2, h, 10 // h direction, 10 bit
+        bpp_frontend_mix2 \wd1, \wd2, h, 12 // h direction, 12 bit
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon // wd=4 filter in the vertical direction over 8 columns of 16 bit pixels
+        mov             x10, x30 // keep the return address; loop_filter_4 does a bl
+        sub             x9,  x0,  x1, lsl #2 // x9 = dst - 4 * stride (p3 row); x0 = dst (q0 row)
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v27.8h}, [x0], x1 // q3
+        sub             x0,  x0,  x1, lsl #2 // x0 back to the q0 row
+        sub             x9,  x9,  x1, lsl #1 // x9 to the p1 row, the first row written back
+
+        loop_filter_4
+
+        st1             {v22.8h}, [x9], x1 // p1
+        st1             {v24.8h}, [x0], x1 // q0
+        st1             {v23.8h}, [x9], x1 // p0
+        st1             {v25.8h}, [x0], x1 // q1
+        sub             x0,  x0,  x1, lsl #1 // leave x0 at its entry value
+
+        br              x10 // return through the saved address
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8 // emits ff_vp9_loop_filter_v_4_8_10_neon and ff_vp9_loop_filter_v_4_8_12_neon (push=0, tail-call form)
+
+function vp9_loop_filter_h_4_8_16_neon // wd=4 filter in the horizontal direction over 8 rows of 16 bit pixels
+        mov             x10, x30 // keep the return address; loop_filter_4 does a bl
+        sub             x9,  x0,  #8 // x9 = row 0, 4 pixels (8 bytes) left of dst
+        add             x0,  x9,  x1, lsl #2 // x0 = same column, row 4
+        ld1             {v20.8h}, [x9], x1 // row 0
+        ld1             {v24.8h}, [x0], x1 // row 4
+        ld1             {v21.8h}, [x9], x1 // row 1
+        ld1             {v25.8h}, [x0], x1 // row 5
+        ld1             {v22.8h}, [x9], x1 // row 2
+        ld1             {v26.8h}, [x0], x1 // row 6
+        ld1             {v23.8h}, [x9], x1 // row 3
+        ld1             {v27.8h}, [x0], x1 // row 7
+
+        sub             x9,  x9,  x1, lsl #2 // x9 back to row 0
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0 ...
+        add             x0,  x0,  #8 // ... and to the entry value of dst
+
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 // rows to columns, so v20-v27 line up with p3..q3
+
+        loop_filter_4
+
+        // Move x9 forward by 2 pixels; we don't need to rewrite the
+        // outermost 2 pixels since they aren't changed.
+        add             x9,  x9,  #4 // 2 pixels of 2 bytes each
+        add             x0,  x9,  x1, lsl #2 // x0 = same column, row 4
+
+        // We only will write the mid 4 pixels back; after the loop filter,
+        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
+        // We need to transpose them to columns, done with a 4x8 transpose
+        // (which in practice is two 4x4 transposes of the two 4x4 halves
+        // of the 8x4 pixels; into 4x8 pixels).
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1 // low halves go to rows 0-3
+        st1             {v22.d}[1], [x0], x1 // high halves go to rows 4-7
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0 ...
+        add             x0,  x0,  #4 // ... and to the entry value of dst
+
+        br              x10 // return through the saved address
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8 // emits ff_vp9_loop_filter_h_4_8_10_neon and ff_vp9_loop_filter_h_4_8_12_neon (push=0, tail-call form)
+
+function vp9_loop_filter_v_8_8_16_neon // wd=8 filter in the vertical direction over 8 columns of 16 bit pixels
+        mov             x10, x30 // keep the return address; loop_filter_8 does a bl
+        sub             x9,  x0,  x1, lsl #2 // x9 = dst - 4 * stride (p3 row); x0 = dst (q0 row)
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v27.8h}, [x0], x1 // q3
+        sub             x9,  x9,  x1, lsl #2 // x9 back to the p3 row
+        sub             x0,  x0,  x1, lsl #2 // x0 back to the q0 row
+        add             x9,  x9,  x1 // x9 to the p2 row, the first row written back
+
+        loop_filter_8
+
+        st1             {v21.8h}, [x9], x1 // p2
+        st1             {v24.8h}, [x0], x1 // q0
+        st1             {v22.8h}, [x9], x1 // p1
+        st1             {v25.8h}, [x0], x1 // q1
+        st1             {v23.8h}, [x9], x1 // p0
+        st1             {v26.8h}, [x0], x1 // q2
+        sub             x0,  x0,  x1, lsl #1 // undo the three row steps on x0 (2 + 1) ...
+        sub             x0,  x0,  x1 // ... leaving it at its entry value
+
+        br              x10 // return through the saved address
+6: // alternative return target (x13): flat8in was skipped, so only the inner 4 rows changed
+        sub             x9,  x0,  x1, lsl #1 // x9 = dst - 2 * stride, the p1 row
+        st1             {v22.8h}, [x9], x1 // p1
+        st1             {v24.8h}, [x0], x1 // q0
+        st1             {v23.8h}, [x9], x1 // p0
+        st1             {v25.8h}, [x0], x1 // q1
+        sub             x0,  x0,  x1, lsl #1 // leave x0 at its entry value
+        br              x10 // return through the saved address
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8 // emits ff_vp9_loop_filter_v_8_8_10_neon and ff_vp9_loop_filter_v_8_8_12_neon (push=0, tail-call form)
+
+function vp9_loop_filter_h_8_8_16_neon // wd=8 filter in the horizontal direction over 8 rows of 16 bit pixels
+        mov             x10, x30 // keep the return address; loop_filter_8 does a bl
+        sub             x9,  x0,  #8 // x9 = row 0, 4 pixels (8 bytes) left of dst
+        add             x0,  x9,  x1, lsl #2 // x0 = same column, row 4
+        ld1             {v20.8h}, [x9], x1 // row 0
+        ld1             {v24.8h}, [x0], x1 // row 4
+        ld1             {v21.8h}, [x9], x1 // row 1
+        ld1             {v25.8h}, [x0], x1 // row 5
+        ld1             {v22.8h}, [x9], x1 // row 2
+        ld1             {v26.8h}, [x0], x1 // row 6
+        ld1             {v23.8h}, [x9], x1 // row 3
+        ld1             {v27.8h}, [x0], x1 // row 7
+
+        sub             x9,  x9,  x1, lsl #2 // x9 back to row 0
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0 ...
+        add             x0,  x0,  #8 // ... and to the entry value of dst
+
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 // rows to columns, so v20-v27 line up with p3..q3
+
+        loop_filter_8
+
+        add             x0,  x9,  x1, lsl #2 // x0 = row 4 of the block that starts at x9
+
+        // Even though only 6 pixels per row have been changed, we write the
+        // full 8 pixel registers.
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 // columns back to rows
+
+        st1             {v20.8h}, [x9], x1 // row 0
+        st1             {v24.8h}, [x0], x1 // row 4
+        st1             {v21.8h}, [x9], x1 // row 1
+        st1             {v25.8h}, [x0], x1 // row 5
+        st1             {v22.8h}, [x9], x1 // row 2
+        st1             {v26.8h}, [x0], x1 // row 6
+        st1             {v23.8h}, [x9], x1 // row 3
+        st1             {v27.8h}, [x0], x1 // row 7
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0 ...
+        add             x0,  x0,  #8 // ... and to the entry value of dst
+
+        br              x10 // return through the saved address
+6: // alternative return target (x13): flat8in was skipped
+        // If we didn't need to do the flat8in part, we use the same writeback
+        // as in loop_filter_h_4_8.
+        add             x9,  x9,  #4 // skip the 2 outermost pixels, which are unchanged
+        add             x0,  x9,  x1, lsl #2 // x0 = same column, row 4
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1 // low halves go to rows 0-3
+        st1             {v22.d}[1], [x0], x1 // high halves go to rows 4-7
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0 ...
+        add             x0,  x0,  #4 // ... and to the entry value of dst
+        br              x10 // return through the saved address
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8 // emits ff_vp9_loop_filter_h_8_8_10_neon and ff_vp9_loop_filter_h_8_8_12_neon (push=0, tail-call form)
+
+bpp_frontends_mix2 4, 4 // emits ff_vp9_loop_filter_{v,h}_44_16_{10,12}_neon
+bpp_frontends_mix2 4, 8 // emits ff_vp9_loop_filter_{v,h}_48_16_{10,12}_neon
+bpp_frontends_mix2 8, 4 // emits ff_vp9_loop_filter_{v,h}_84_16_{10,12}_neon
+bpp_frontends_mix2 8, 8 // emits ff_vp9_loop_filter_{v,h}_88_16_{10,12}_neon
+
+function vp9_loop_filter_v_16_8_16_neon // wd=16 filter in the vertical direction over 8 columns of 16 bit pixels
+        mov             x10, x30 // keep the return address; loop_filter_16 does a bl
+        sub             x9,  x0,  x1, lsl #3 // x9 = dst - 8 * stride (p7 row); x0 = dst (q0 row)
+        ld1             {v16.8h}, [x9], x1 // p7
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v17.8h}, [x9], x1 // p6
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v18.8h}, [x9], x1 // p5
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v19.8h}, [x9], x1 // p4
+        ld1             {v27.8h}, [x0], x1 // q3
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v28.8h}, [x0], x1 // q4
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v29.8h}, [x0], x1 // q5
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v30.8h}, [x0], x1 // q6
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v31.8h}, [x0], x1 // q7
+        sub             x9,  x9,  x1, lsl #3 // x9 back to the p7 row
+        sub             x0,  x0,  x1, lsl #3 // x0 back to the q0 row
+        add             x9,  x9,  x1 // x9 to the p6 row, the first row written back
+
+        loop_filter_16
+
+        // If we did the flat8out part, we get the output in
+        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
+        // store v2-v9 there, and v10-v17 into x0.
+        st1             {v2.8h},  [x9], x1 // p6
+        st1             {v10.8h}, [x0], x1 // q0
+        st1             {v3.8h},  [x9], x1 // p5
+        st1             {v11.8h}, [x0], x1 // q1
+        st1             {v4.8h},  [x9], x1 // p4
+        st1             {v12.8h}, [x0], x1 // q2
+        st1             {v5.8h},  [x9], x1 // p3
+        st1             {v13.8h}, [x0], x1 // q3
+        st1             {v6.8h},  [x9], x1 // p2
+        st1             {v14.8h}, [x0], x1 // q4
+        st1             {v8.8h},  [x9], x1 // p1
+        st1             {v15.8h}, [x0], x1 // q5
+        st1             {v9.8h},  [x9], x1 // p0
+        st1             {v17.8h}, [x0], x1 // q6
+        sub             x0,  x0,  x1, lsl #3 // undo the seven row steps on x0 (8 - 1) ...
+        add             x0,  x0,  x1 // ... leaving it at its entry value
+
+        br              x10 // return through the saved address
+8: // alternative return target (x15): flat8out was skipped, so the inner 6 rows are written
+        add             x9,  x9,  x1, lsl #2 // from the p6 row to the p2 row
+        // If we didn't do the flat8out part, the output is left in the
+        // input registers.
+        st1             {v21.8h}, [x9], x1 // p2
+        st1             {v24.8h}, [x0], x1 // q0
+        st1             {v22.8h}, [x9], x1 // p1
+        st1             {v25.8h}, [x0], x1 // q1
+        st1             {v23.8h}, [x9], x1 // p0
+        st1             {v26.8h}, [x0], x1 // q2
+        sub             x0,  x0,  x1, lsl #1 // undo the three row steps on x0 (2 + 1) ...
+        sub             x0,  x0,  x1 // ... leaving it at its entry value
+        br              x10 // return through the saved address
+7: // alternative return target (x14): neither flat8in nor flat8out ran, so the inner 4 rows are written
+        sub             x9,  x0,  x1, lsl #1 // x9 = dst - 2 * stride, the p1 row
+        st1             {v22.8h}, [x9], x1 // p1
+        st1             {v24.8h}, [x0], x1 // q0
+        st1             {v23.8h}, [x9], x1 // p0
+        st1             {v25.8h}, [x0], x1 // q1
+        sub             x0,  x0,  x1, lsl #1 // leave x0 at its entry value
+        br              x10 // return through the saved address
+endfunc
+
+bpp_frontends vp9_loop_filter_v_16_8, push=1 // emits ff_vp9_loop_filter_v_16_8_{10,12}_neon; push=1 since the wd=16 body uses v8-v15
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1 // emits ff_vp9_loop_filter_v_16_16_{10,12}_neon: two calls of the routine above, 16 bytes apart
+
+function vp9_loop_filter_h_16_8_16_neon // wd=16 filter in the horizontal direction over 8 rows of 16 bit pixels
+        mov             x10, x30 // keep the return address; loop_filter_16 does a bl
+        sub             x9,  x0,  #16 // x9 = 8 pixels (16 bytes) left of dst; x0 = dst
+        ld1             {v16.8h}, [x9], x1 // row 0, left half
+        ld1             {v24.8h}, [x0], x1 // row 0, right half
+        ld1             {v17.8h}, [x9], x1 // row 1
+        ld1             {v25.8h}, [x0], x1
+        ld1             {v18.8h}, [x9], x1 // row 2
+        ld1             {v26.8h}, [x0], x1
+        ld1             {v19.8h}, [x9], x1 // row 3
+        ld1             {v27.8h}, [x0], x1
+        ld1             {v20.8h}, [x9], x1 // row 4
+        ld1             {v28.8h}, [x0], x1
+        ld1             {v21.8h}, [x9], x1 // row 5
+        ld1             {v29.8h}, [x0], x1
+        ld1             {v22.8h}, [x9], x1 // row 6
+        ld1             {v30.8h}, [x0], x1
+        ld1             {v23.8h}, [x9], x1 // row 7
+        ld1             {v31.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0
+        sub             x9,  x9,  x1, lsl #3 // x9 back to row 0
+
+        // The 16x8 pixels read above are in two 8x8 blocks; the left
+        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
+        // of this, to get one column per register.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+        loop_filter_16
+
+        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1 // p7 from v16 plus the p6..p0 outputs, back to row order
+        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1 // the q0..q6 outputs plus q7 from v31, back to row order
+
+        st1             {v16.8h}, [x9], x1 // row 0, left half
+        st1             {v10.8h}, [x0], x1 // row 0, right half
+        st1             {v2.8h},  [x9], x1 // row 1
+        st1             {v11.8h}, [x0], x1
+        st1             {v3.8h},  [x9], x1 // row 2
+        st1             {v12.8h}, [x0], x1
+        st1             {v4.8h},  [x9], x1 // row 3
+        st1             {v13.8h}, [x0], x1
+        st1             {v5.8h},  [x9], x1 // row 4
+        st1             {v14.8h}, [x0], x1
+        st1             {v6.8h},  [x9], x1 // row 5
+        st1             {v15.8h}, [x0], x1
+        st1             {v8.8h},  [x9], x1 // row 6
+        st1             {v17.8h}, [x0], x1
+        st1             {v9.8h},  [x9], x1 // row 7
+        st1             {v31.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3 // leave x0 at its entry value
+
+        br              x10 // return through the saved address
+8: // alternative return target (x15): flat8out was skipped
+        // The same writeback as in loop_filter_h_8_8
+        sub             x9,  x0,  #8 // row 0, 4 pixels left of dst
+        add             x0,  x9,  x1, lsl #2 // same column, row 4
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 // p3..q3 columns back to rows
+
+        st1             {v20.8h}, [x9], x1 // row 0
+        st1             {v24.8h}, [x0], x1 // row 4
+        st1             {v21.8h}, [x9], x1 // row 1
+        st1             {v25.8h}, [x0], x1 // row 5
+        st1             {v22.8h}, [x9], x1 // row 2
+        st1             {v26.8h}, [x0], x1 // row 6
+        st1             {v23.8h}, [x9], x1 // row 3
+        st1             {v27.8h}, [x0], x1 // row 7
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0 ...
+        add             x0,  x0,  #8 // ... and to the entry value of dst
+        br              x10 // return through the saved address
+7: // alternative return target (x14): neither flat8in nor flat8out ran
+        // The same writeback as in loop_filter_h_4_8
+        sub             x9,  x0,  #4 // row 0, 2 pixels left of dst
+        add             x0,  x9,  x1, lsl #2 // same column, row 4
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1 // low halves go to rows 0-3
+        st1             {v22.d}[1], [x0], x1 // high halves go to rows 4-7
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3 // x0 back to row 0 ...
+        add             x0,  x0,  #4 // ... and to the entry value of dst
+        br              x10 // return through the saved address
+endfunc
+
+bpp_frontends vp9_loop_filter_h_16_8, push=1 // emits ff_vp9_loop_filter_h_16_8_{10,12}_neon; push=1 since the wd=16 body uses v8-v15
+bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1 // emits ff_vp9_loop_filter_h_16_16_{10,12}_neon: two calls of the routine above, 8 rows apart
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e9c497096b..0878763020 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/aarch64/vp9mc_16bpp_neon.S b/libavcodec/aarch64/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000000..cac6428709
--- /dev/null
+++ b/libavcodec/aarch64/vp9mc_16bpp_neon.S
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// All public functions in this file have the following signature:
+// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+//                            const uint8_t *ref, ptrdiff_t ref_stride,
+//                            int h, int mx, int my);
+
+function ff_vp9_copy128_aarch64, export=1 // copy w4 rows of 128 bytes from x2 to x0 using general-purpose registers only
+1:
+        ldp             x5,  x6,  [x2]
+        ldp             x7,  x8,  [x2, #16]
+        stp             x5,  x6,  [x0]
+        ldp             x9,  x10, [x2, #32]
+        stp             x7,  x8,  [x0, #16]
+        subs            w4,  w4,  #1                // one row per iteration; the flags are consumed by the b.ne below
+        ldp             x11, x12, [x2, #48]
+        stp             x9,  x10, [x0, #32]
+        stp             x11, x12, [x0, #48]
+        ldp             x5,  x6,  [x2, #64]         // second 64 byte half of the row reuses x5-x12
+        ldp             x7,  x8,  [x2, #80]
+        stp             x5,  x6,  [x0, #64]
+        ldp             x9,  x10, [x2, #96]
+        stp             x7,  x8,  [x0, #80]
+        ldp             x11, x12, [x2, #112]
+        stp             x9,  x10, [x0, #96]
+        stp             x11, x12, [x0, #112]
+        add             x2,  x2,  x3                // advance src by its stride (add leaves the flags untouched)
+        add             x0,  x0,  x1                // advance dst by its stride
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg64_16_neon, export=1 // dst = rounded average of dst and ref; 128 bytes (64 halfwords) per row, w4 rows
+        mov             x5,  x0                     // x5: store pointer, x0 is used for reading dst
+        sub             x1,  x1,  #64               // each row is accessed as two 64 byte halves: the first
+        sub             x3,  x3,  #64               // post-increments by 64, the second by stride - 64
+1:
+        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+        urhadd          v0.8h,  v0.8h,  v4.8h       // unsigned rounding halving add: (a + b + 1) >> 1
+        urhadd          v1.8h,  v1.8h,  v5.8h
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+        urhadd          v2.8h,  v2.8h,  v6.8h
+        urhadd          v3.8h,  v3.8h,  v7.8h
+        subs            w4,  w4,  #1                // one row per iteration
+        urhadd          v16.8h, v16.8h, v20.8h
+        urhadd          v17.8h, v17.8h, v21.8h
+        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], #64
+        urhadd          v18.8h, v18.8h, v22.8h
+        urhadd          v19.8h, v19.8h, v23.8h
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1 // dst = rounded average of dst and ref; 64 bytes (32 halfwords) per row, w4 rows
+        mov             x5,  x0                     // x5: store pointer, x0 is used for reading dst
+1:
+        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
+        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], x1
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+        urhadd          v0.8h,  v0.8h,  v4.8h       // unsigned rounding halving add: (a + b + 1) >> 1
+        urhadd          v1.8h,  v1.8h,  v5.8h
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+        urhadd          v2.8h,  v2.8h,  v6.8h
+        urhadd          v3.8h,  v3.8h,  v7.8h
+        subs            w4,  w4,  #2                // two rows per iteration
+        urhadd          v16.8h, v16.8h, v20.8h
+        urhadd          v17.8h, v17.8h, v21.8h
+        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], x1
+        urhadd          v18.8h, v18.8h, v22.8h
+        urhadd          v19.8h, v19.8h, v23.8h
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1 // dst = rounded average of dst and ref; 32 bytes (16 halfwords) per row, w4 rows
+1:
+        ld1             {v2.8h, v3.8h},  [x2], x3
+        ld1             {v0.8h, v1.8h},  [x0]       // no post-increment: the same row is stored back below
+        urhadd          v0.8h,  v0.8h,  v2.8h       // unsigned rounding halving add: (a + b + 1) >> 1
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        subs            w4,  w4,  #1                // one row per iteration
+        st1             {v0.8h, v1.8h},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1 // dst = rounded average of dst and ref; 16 bytes (8 halfwords) per row, w4 rows
+        mov             x5,  x0                     // x5: store pointer, x0 is used for reading dst
+1:
+        ld1             {v2.8h},  [x2], x3
+        ld1             {v0.8h},  [x0], x1
+        ld1             {v3.8h},  [x2], x3
+        urhadd          v0.8h,  v0.8h,  v2.8h       // unsigned rounding halving add: (a + b + 1) >> 1
+        ld1             {v1.8h},  [x0], x1
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        subs            w4,  w4,  #2                // two rows per iteration
+        st1             {v0.8h},  [x5], x1
+        st1             {v1.8h},  [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1 // dst = rounded average of dst and ref; 8 bytes (4 halfwords) per row, w4 rows
+        mov             x5,  x0                     // x5: store pointer, x0 is used for reading dst
+1:
+        ld1             {v2.4h},  [x2], x3
+        ld1             {v0.4h},  [x0], x1
+        ld1             {v3.4h},  [x2], x3
+        urhadd          v0.4h,  v0.4h,  v2.4h       // unsigned rounding halving add: (a + b + 1) >> 1
+        ld1             {v1.4h},  [x0], x1
+        urhadd          v1.4h,  v1.4h,  v3.4h
+        subs            w4,  w4,  #2                // two rows per iteration
+        st1             {v0.4h},  [x5], x1
+        st1             {v1.4h},  [x5], x1          // same 8 bytes as an .8b store, written with the 16 bit arrangement used throughout
+        b.ne            1b
+        ret
+endfunc
+
+
+// Extract a vector at sample offset "offset" from src1-src2 and src4-src5
+// (src1-src3 and src4-src6 for size >= 16), multiply it by coefficient
+// v0.h[offset] and accumulate into dst1 and dst5 (dst1-dst2 and dst5-dst6 for
+// size >= 8, dst1-dst4 and dst5-dst8 for size >= 16). Clobbers v20-v23.
+.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
+        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) // byte offset is 2*offset since samples are 16 bit
+        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
+        smlal           \dst1\().4s, v20.4h, v0.h[\offset]
+        smlal           \dst5\().4s, v22.4h, v0.h[\offset]
+.if \size >= 16
+        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
+        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
+.endif
+.if \size >= 8
+        smlal2          \dst2\().4s, v20.8h, v0.h[\offset] // upper four lanes of the extracted vector
+        smlal2          \dst6\().4s, v22.8h, v0.h[\offset]
+.endif
+.if \size >= 16
+        smlal           \dst3\().4s, v21.4h, v0.h[\offset]
+        smlal           \dst7\().4s, v23.4h, v0.h[\offset]
+        smlal2          \dst4\().4s, v21.8h, v0.h[\offset]
+        smlal2          \dst8\().4s, v23.8h, v0.h[\offset]
+.endif
+.endm
+
+
+// Instantiate a horizontal 8-tap filter function (put or avg) for the given
+// size. It works on 4, 8 or 16 pixels in parallel and on two rows per pass;
+// larger widths are done 16 pixels at a time, looping horizontally. Inputs:
+// width in bytes in x5, height in w4, pointer to the filter coefficients in
+// x9 and the maximum pixel value in every lane of v31.
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+        sub             x2,  x2,  #6                // back up 6 bytes = 3 samples of 16 bit
+        add             x6,  x0,  x1                // x6: dst pointer for the second row
+        add             x7,  x2,  x3                // x7: src pointer for the second row
+        add             x1,  x1,  x1                // two rows are handled per pass, so
+        add             x3,  x3,  x3                // both strides are doubled
+        // Only size >= 16 loops horizontally and needs
+        // reduced dst stride
+.if \size >= 16
+        sub             x1,  x1,  x5
+.endif
+        // size >= 16 loads three qwords and post-increments x2 (by the
+        // width plus 16 bytes per row in total); for size 4/8 two qwords
+        // are loaded without any post-increment
+.if \size >= 16
+        sub             x3,  x3,  x5
+        sub             x3,  x3,  #16
+.endif
+        // Load the filter vector
+        ld1             {v0.8h},  [x9]
+1:
+.if \size >= 16
+        mov             x9,  x5                     // x9: bytes left in this row (the filter is already in v0)
+.endif
+        // Load src
+.if \size >= 16
+        ld1             {v5.8h,  v6.8h,  v7.8h},  [x2], #48
+        ld1             {v16.8h, v17.8h, v18.8h}, [x7], #48
+.else
+        ld1             {v5.8h,  v6.8h},  [x2]
+        ld1             {v16.8h, v17.8h}, [x7]
+.endif
+2:
+
+        smull           v1.4s,  v5.4h,  v0.h[0]     // tap 0 starts the 32 bit accumulators; v1-v4: first row
+        smull           v24.4s, v16.4h, v0.h[0]     // v24-v27: second row
+.if \size >= 8
+        smull2          v2.4s,  v5.8h,  v0.h[0]
+        smull2          v25.4s, v16.8h, v0.h[0]
+.endif
+.if \size >= 16
+        smull           v3.4s,  v6.4h,  v0.h[0]
+        smull           v26.4s, v17.4h, v0.h[0]
+        smull2          v4.4s,  v6.8h,  v0.h[0]
+        smull2          v27.4s, v17.8h, v0.h[0]
+.endif
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 1, \size // taps 1-7
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 2, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 3, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 4, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 5, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 6, \size
+        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 7, \size
+
+        // Round, shift and saturate
+        // The sqrshrun takes care of clamping negative values to zero, but
+        // we manually need to do umin with the max pixel value.
+        sqrshrun        v1.4h,  v1.4s,  #7
+        sqrshrun        v24.4h, v24.4s, #7
+.if \size >= 8
+        sqrshrun2       v1.8h,  v2.4s,  #7
+        sqrshrun2       v24.8h, v25.4s, #7
+        umin            v1.8h,  v1.8h,  v31.8h
+        umin            v24.8h, v24.8h, v31.8h
+.if \size >= 16
+        sqrshrun        v2.4h,  v3.4s,  #7
+        sqrshrun        v25.4h, v26.4s, #7
+        sqrshrun2       v2.8h,  v4.4s,  #7
+        sqrshrun2       v25.8h, v27.4s, #7
+        umin            v2.8h,  v2.8h,  v31.8h
+        umin            v25.8h, v25.8h, v31.8h
+.endif
+.else
+        umin            v1.4h,  v1.4h,  v31.4h
+        umin            v24.4h, v24.4h, v31.4h
+.endif
+        // Average
+.ifc \type,avg
+.if \size >= 16
+        ld1             {v3.8h,  v4.8h},  [x0]      // current dst contents of both rows
+        ld1             {v29.8h, v30.8h}, [x6]
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        urhadd          v2.8h,  v2.8h,  v4.8h
+        urhadd          v24.8h, v24.8h, v29.8h
+        urhadd          v25.8h, v25.8h, v30.8h
+.elseif \size >= 8
+        ld1             {v3.8h},  [x0]
+        ld1             {v4.8h},  [x6]
+        urhadd          v1.8h,  v1.8h,  v3.8h
+        urhadd          v24.8h, v24.8h, v4.8h
+.else
+        ld1             {v3.4h},  [x0]
+        ld1             {v4.4h},  [x6]
+        urhadd          v1.4h,  v1.4h,  v3.4h
+        urhadd          v24.4h, v24.4h, v4.4h
+.endif
+.endif
+        // Store and loop horizontally (for size >= 16)
+.if \size >= 16
+        subs            x9,  x9,  #32               // 32 bytes (16 pixels) of each row are done
+        st1             {v1.8h,  v2.8h},  [x0], #32
+        st1             {v24.8h, v25.8h}, [x6], #32
+        b.eq            3f
+        mov             v5.16b,  v7.16b             // slide the window: the last qword becomes the first
+        mov             v16.16b, v18.16b
+        ld1             {v6.8h,  v7.8h},  [x2], #32 // and two new qwords are loaded per row
+        ld1             {v17.8h, v18.8h}, [x7], #32
+        b               2b
+.elseif \size == 8
+        st1             {v1.8h},  [x0]
+        st1             {v24.8h}, [x6]
+.else // \size == 4
+        st1             {v1.4h},  [x0]
+        st1             {v24.4h}, [x6]
+.endif
+3:
+        // Loop vertically
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x2,  x2,  x3
+        add             x7,  x7,  x3
+        subs            w4,  w4,  #2                // two rows were produced in this pass
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size // emits put_8tap_<size>h
+do_8tap_h avg, \size // emits avg_8tap_<size>h
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16 // also serves widths 32 and 64, see the size >= 16 branch in do_8tap_h_func
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1 // public entry: sets up v31, x9 and x5, then tail-calls the shared body
+        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 // max pixel value per lane: 0x03ff for 10 bpp, 0x0fff for 12 bpp
+        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset // each filter set occupies 256 bytes
+        // mx (w5) selects one 16 byte filter (8 halfword coefficients) in the set
+        add             x9,  x6,  w5, uxtw #4
+        mov             x5,  #2*\size               // width in bytes, as expected by the shared body
+.if \size >= 16
+        b               \type\()_8tap_16h
+.else
+        b               \type\()_8tap_\size\()h
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp // third argument: index of the 256 byte filter set
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp,   2, \size, \bpp
+do_8tap_h_func avg, sharp,   2, \size, \bpp
+do_8tap_h_func put, smooth,  0, \size, \bpp
+do_8tap_h_func avg, smooth,  0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp // six entry points (put/avg x regular/sharp/smooth) per width
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8,  \bpp
+do_8tap_h_filters 4,  \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+
+// Vertical filters
+
+// Round, shift, saturate and store the four 4 pixel rows in reg1-reg4 to [x0], clamped to minreg; for avg, dst is read via x7 into tmp1-tmp4
+.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+        sqrshrun        \reg1\().4h,  \reg1\().4s, #7 // rounding shift by 7, narrowing with unsigned saturation
+        sqrshrun        \reg2\().4h,  \reg2\().4s, #7
+        sqrshrun        \reg3\().4h,  \reg3\().4s, #7
+        sqrshrun        \reg4\().4h,  \reg4\().4s, #7
+.ifc \type,avg
+        ld1             {\tmp1\().4h},  [x7], x1
+        ld1             {\tmp2\().4h},  [x7], x1
+        ld1             {\tmp3\().4h},  [x7], x1
+        ld1             {\tmp4\().4h},  [x7], x1
+.endif
+        umin            \reg1\().4h,  \reg1\().4h,  \minreg\().4h // clamp to the value held in minreg
+        umin            \reg2\().4h,  \reg2\().4h,  \minreg\().4h
+        umin            \reg3\().4h,  \reg3\().4h,  \minreg\().4h
+        umin            \reg4\().4h,  \reg4\().4h,  \minreg\().4h
+.ifc \type,avg
+        urhadd          \reg1\().4h,  \reg1\().4h,  \tmp1\().4h
+        urhadd          \reg2\().4h,  \reg2\().4h,  \tmp2\().4h
+        urhadd          \reg3\().4h,  \reg3\().4h,  \tmp3\().4h
+        urhadd          \reg4\().4h,  \reg4\().4h,  \tmp4\().4h
+.endif
+        st1             {\reg1\().4h},  [x0], x1
+        st1             {\reg2\().4h},  [x0], x1
+        st1             {\reg3\().4h},  [x0], x1
+        st1             {\reg4\().4h},  [x0], x1
+.endm
+
+// Round, shift, saturate and store reg1-8 to [x0], where reg1-2, reg3-4 etc
+// pairwise correspond to 4 rows. Results are packed into reg1-reg4; avg reads dst via x7 into reg5-reg8.
+.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
+        sqrshrun        \reg1\().4h,  \reg1\().4s, #7 // row 0, low half
+        sqrshrun2       \reg1\().8h,  \reg2\().4s, #7 // row 0, high half
+        sqrshrun        \reg2\().4h,  \reg3\().4s, #7 // row 1
+        sqrshrun2       \reg2\().8h,  \reg4\().4s, #7
+        sqrshrun        \reg3\().4h,  \reg5\().4s, #7 // row 2
+        sqrshrun2       \reg3\().8h,  \reg6\().4s, #7
+        sqrshrun        \reg4\().4h,  \reg7\().4s, #7 // row 3
+        sqrshrun2       \reg4\().8h,  \reg8\().4s, #7
+.ifc \type,avg
+        ld1             {\reg5\().8h},  [x7], x1    // reg5-reg8 are free once narrowed, reuse them for dst
+        ld1             {\reg6\().8h},  [x7], x1
+        ld1             {\reg7\().8h},  [x7], x1
+        ld1             {\reg8\().8h},  [x7], x1
+.endif
+        umin            \reg1\().8h,  \reg1\().8h,  \minreg\().8h // clamp to the value held in minreg
+        umin            \reg2\().8h,  \reg2\().8h,  \minreg\().8h
+        umin            \reg3\().8h,  \reg3\().8h,  \minreg\().8h
+        umin            \reg4\().8h,  \reg4\().8h,  \minreg\().8h
+.ifc \type,avg
+        urhadd          \reg1\().8h,  \reg1\().8h,  \reg5\().8h
+        urhadd          \reg2\().8h,  \reg2\().8h,  \reg6\().8h
+        urhadd          \reg3\().8h,  \reg3\().8h,  \reg7\().8h
+        urhadd          \reg4\().8h,  \reg4\().8h,  \reg8\().8h
+.endif
+        st1             {\reg1\().8h},  [x0], x1
+        st1             {\reg2\().8h},  [x0], x1
+        st1             {\reg3\().8h},  [x0], x1
+        st1             {\reg4\().8h},  [x0], x1
+.endm
+
+// Evaluate the filter twice in parallel on 4 pixels, from the inputs src1-src9 into dst1-dst2
+// (src1-src8 into dst1, src2-src9 into dst2). Coefficients come from v0; tmp1-tmp2 are clobbered.
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+        smull           \dst1\().4s, \src1\().4h, v0.h[0] // even taps accumulate in dst1/dst2
+        smull           \dst2\().4s, \src2\().4h, v0.h[0]
+        smull           \tmp1\().4s, \src2\().4h, v0.h[1] // odd taps accumulate in tmp1/tmp2
+        smull           \tmp2\().4s, \src3\().4h, v0.h[1]
+        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
+        smlal           \dst2\().4s, \src4\().4h, v0.h[2]
+        smlal           \tmp1\().4s, \src4\().4h, v0.h[3]
+        smlal           \tmp2\().4s, \src5\().4h, v0.h[3]
+        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
+        smlal           \dst2\().4s, \src6\().4h, v0.h[4]
+        smlal           \tmp1\().4s, \src6\().4h, v0.h[5]
+        smlal           \tmp2\().4s, \src7\().4h, v0.h[5]
+        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
+        smlal           \dst2\().4s, \src8\().4h, v0.h[6]
+        smlal           \tmp1\().4s, \src8\().4h, v0.h[7]
+        smlal           \tmp2\().4s, \src9\().4h, v0.h[7]
+        add             \dst1\().4s, \dst1\().4s, \tmp1\().4s // merge the even and odd partial sums
+        add             \dst2\().4s, \dst2\().4s, \tmp2\().4s
+.endm
+
+// Evaluate the filter twice in parallel on 8 pixels, from the inputs src1-src9 into dst1-dst4
+// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4). Coefficients come from v0.
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
+        smull           \dst1\().4s, \src1\().4h, v0.h[0] // dst1/dst3: low four lanes
+        smull2          \dst2\().4s, \src1\().8h, v0.h[0] // dst2/dst4: high four lanes
+        smull           \dst3\().4s, \src2\().4h, v0.h[0]
+        smull2          \dst4\().4s, \src2\().8h, v0.h[0]
+        smlal           \dst1\().4s, \src2\().4h, v0.h[1]
+        smlal2          \dst2\().4s, \src2\().8h, v0.h[1]
+        smlal           \dst3\().4s, \src3\().4h, v0.h[1]
+        smlal2          \dst4\().4s, \src3\().8h, v0.h[1]
+        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
+        smlal2          \dst2\().4s, \src3\().8h, v0.h[2]
+        smlal           \dst3\().4s, \src4\().4h, v0.h[2]
+        smlal2          \dst4\().4s, \src4\().8h, v0.h[2]
+        smlal           \dst1\().4s, \src4\().4h, v0.h[3]
+        smlal2          \dst2\().4s, \src4\().8h, v0.h[3]
+        smlal           \dst3\().4s, \src5\().4h, v0.h[3]
+        smlal2          \dst4\().4s, \src5\().8h, v0.h[3]
+        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
+        smlal2          \dst2\().4s, \src5\().8h, v0.h[4]
+        smlal           \dst3\().4s, \src6\().4h, v0.h[4]
+        smlal2          \dst4\().4s, \src6\().8h, v0.h[4]
+        smlal           \dst1\().4s, \src6\().4h, v0.h[5]
+        smlal2          \dst2\().4s, \src6\().8h, v0.h[5]
+        smlal           \dst3\().4s, \src7\().4h, v0.h[5]
+        smlal2          \dst4\().4s, \src7\().8h, v0.h[5]
+        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
+        smlal2          \dst2\().4s, \src7\().8h, v0.h[6]
+        smlal           \dst3\().4s, \src8\().4h, v0.h[6]
+        smlal2          \dst4\().4s, \src8\().8h, v0.h[6]
+        smlal           \dst1\().4s, \src8\().4h, v0.h[7]
+        smlal2          \dst2\().4s, \src8\().8h, v0.h[7]
+        smlal           \dst3\().4s, \src9\().4h, v0.h[7]
+        smlal2          \dst4\().4s, \src9\().8h, v0.h[7]
+.endm
+
+// Instantiate a vertical filter function for filtering 8 pixels at a time.
+// The height is passed in x4, the width in x5, a pointer to the filter
+// coefficients in x6 and the max pixel value in every lane of v1.
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+        sub             x2,  x2,  x3, lsl #1        // start three rows above the output position
+        sub             x2,  x2,  x3
+        ld1             {v0.8h},  [x6]
+1:
+.ifc \type,avg
+        mov             x7,  x0                     // x7: dst read pointer used by do_store8
+.endif
+        mov             x6,  x4                     // x6: rows left in this column (the filter is already in v0)
+
+        ld1             {v17.8h}, [x2], x3          // preload the first seven input rows
+        ld1             {v18.8h}, [x2], x3
+        ld1             {v19.8h}, [x2], x3
+        ld1             {v20.8h}, [x2], x3
+        ld1             {v21.8h}, [x2], x3
+        ld1             {v22.8h}, [x2], x3
+        ld1             {v23.8h}, [x2], x3
+2:
+        ld1             {v24.8h}, [x2], x3          // four new input rows give four output rows
+        ld1             {v25.8h}, [x2], x3
+        ld1             {v26.8h}, [x2], x3
+        ld1             {v27.8h}, [x2], x3
+
+        convolve8       v2,  v3,  v4,  v5,  v17, v18, v19, v20, v21, v22, v23, v24, v25
+        convolve8       v6,  v7,  v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v30, v31, v1,  \type
+
+        subs            x6,  x6,  #4
+        b.eq            8f
+
+        ld1             {v16.8h}, [x2], x3          // the input rows rotate through v16-v27 instead of being moved
+        ld1             {v17.8h}, [x2], x3
+        ld1             {v18.8h}, [x2], x3
+        ld1             {v19.8h}, [x2], x3
+        convolve8       v2,  v3,  v4,  v5,  v21, v22, v23, v24, v25, v26, v27, v16, v17
+        convolve8       v6,  v7,  v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
+        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v20, v21, v1,  \type
+
+        subs            x6,  x6,  #4
+        b.eq            8f
+
+        ld1             {v20.8h}, [x2], x3
+        ld1             {v21.8h}, [x2], x3
+        ld1             {v22.8h}, [x2], x3
+        ld1             {v23.8h}, [x2], x3
+        convolve8       v2,  v3,  v4,  v5,  v25, v26, v27, v16, v17, v18, v19, v20, v21
+        convolve8       v6,  v7,  v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
+        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v24, v25, v1,  \type
+
+        subs            x6,  x6,  #4
+        b.ne            2b                          // v17-v23 again hold the seven most recent rows
+
+8:
+        subs            x5,  x5,  #8                // one 8 pixel wide column is done
+        b.eq            9f
+        // x0 -= h * dst_stride
+        msub            x0,  x1,  x4, x0
+        // x2 -= h * src_stride
+        msub            x2,  x3,  x4, x2
+        // x2 -= 8 * src_stride
+        sub             x2,  x2,  x3, lsl #3
+        // x2 += 1 * src_stride
+        add             x2,  x2,  x3
+        add             x2,  x2,  #16               // step 8 pixels = 16 bytes to the right
+        add             x0,  x0,  #16
+        b               1b
+9:
+        ret
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+
+// Instantiate a vertical filter function for filtering a 4 pixels wide
+// slice. This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+        sub             x2,  x2,  x3, lsl #1        // start three rows above the output position
+        sub             x2,  x2,  x3
+        ld1             {v0.8h},  [x6]              // the eight filter coefficients
+.ifc \type,avg
+        mov             x7,  x0                     // x7: dst read pointer used by do_store4
+.endif
+
+        ld1             {v16.4h}, [x2], x3          // eleven input rows give the first four output rows
+        ld1             {v17.4h}, [x2], x3
+        ld1             {v18.4h}, [x2], x3
+        ld1             {v19.4h}, [x2], x3
+        ld1             {v20.4h}, [x2], x3
+        ld1             {v21.4h}, [x2], x3
+        ld1             {v22.4h}, [x2], x3
+        ld1             {v23.4h}, [x2], x3
+        ld1             {v24.4h}, [x2], x3
+        ld1             {v25.4h}, [x2], x3
+        ld1             {v26.4h}, [x2], x3
+
+        convolve4       v2,  v3,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
+        convolve4       v4,  v5,  v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
+        do_store4       v2,  v3,  v4,  v5,  v28, v29, v30, v31, v1,  \type
+
+        subs            x4,  x4,  #4
+        b.eq            9f
+
+        ld1             {v27.4h}, [x2], x3          // four more input rows for output rows 4-7
+        ld1             {v28.4h}, [x2], x3
+        ld1             {v29.4h}, [x2], x3
+        ld1             {v30.4h}, [x2], x3
+
+        convolve4       v2,  v3,  v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
+        convolve4       v4,  v5,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
+        do_store4       v2,  v3,  v4,  v5,  v16, v17, v18, v19, v1,  \type
+
+9:
+        ret
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1 // public entry: sets up x4, v1, x6 and x5, then tail-calls the shared body
+        uxtw            x4,  w4                     // zero-extend the height; the 8v body uses the full x4 in msub
+        mvni            v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 // max pixel value per lane: 0x03ff for 10 bpp, 0x0fff for 12 bpp
+        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset // each filter set occupies 256 bytes
+        add             x6,  x5,  w6, uxtw #4       // my (w6) selects one 16 byte filter in the set
+        mov             x5,  #\size                 // width in pixels; the 8v body counts it down by 8 per column
+.if \size >= 8
+        b               \type\()_8tap_8v
+.else
+        b               \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp // third argument: index of the 256 byte filter set
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp,   2, \size, \bpp
+do_8tap_v_func avg, sharp,   2, \size, \bpp
+do_8tap_v_func put, smooth,  0, \size, \bpp
+do_8tap_v_func avg, smooth,  0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp // six entry points (put/avg x regular/sharp/smooth) per width
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8,  \bpp
+do_8tap_v_filters 4,  \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 584c114269..f67624ca04 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2016 Google Inc.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -269,8 +269,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2
         sub             x3,  x3,  #8
 .endif
         // Load the filter vector
-        ld1             {v0.8b},  [x9]
-        sxtl            v0.8h,  v0.8b
+        ld1             {v0.8h},  [x9]
 1:
 .if \size >= 16
         mov             x9,  x5
@@ -384,9 +383,9 @@ do_8tap_h_size 16
 
 .macro do_8tap_h_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
-        movrel          x6,  X(ff_vp9_subpel_filters), 120*\offset - 8
+        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
         cmp             w5,  #8
-        add             x9,  x6,  w5, uxtw #3
+        add             x9,  x6,  w5, uxtw #4
         mov             x5,  #\size
 .if \size >= 16
         b.ge            \type\()_8tap_16h_34
@@ -516,8 +515,7 @@ do_8tap_h_filters 4
 function \type\()_8tap_8v_\idx1\idx2
         sub             x2,  x2,  x3, lsl #1
         sub             x2,  x2,  x3
-        ld1             {v0.8b},  [x6]
-        sxtl            v0.8h,  v0.8b
+        ld1             {v0.8h},  [x6]
 1:
 .ifc \type,avg
         mov             x7,  x0
@@ -590,8 +588,7 @@ do_8tap_8v avg, 4, 3
 function \type\()_8tap_4v_\idx1\idx2
         sub             x2,  x2,  x3, lsl #1
         sub             x2,  x2,  x3
-        ld1             {v0.8b},  [x6]
-        sxtl            v0.8h,  v0.8b
+        ld1             {v0.8h},  [x6]
 .ifc \type,avg
         mov             x7,  x0
 .endif
@@ -660,9 +657,9 @@ do_8tap_4v avg, 4, 3
 .macro do_8tap_v_func type, filter, offset, size
 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
         uxtw            x4,  w4
-        movrel          x5,  X(ff_vp9_subpel_filters), 120*\offset - 8
+        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
         cmp             w6,  #8
-        add             x6,  x5,  w6, uxtw #3
+        add             x6,  x5,  w6, uxtw #4
         mov             x5,  #\size
 .if \size >= 8
         b.ge            \type\()_8tap_8v_34