diff options
Diffstat (limited to 'libavcodec/aarch64')
56 files changed, 6278 insertions, 1027 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 5c1d118383..8bc8bc528c 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -6,18 +6,21 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o -OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o -OBJS-$(CONFIG_MDCT) += aarch64/mdct_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o +OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o # decoders/encoders -OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o +OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_init_aarch64.o \ + aarch64/sbrdsp_init_aarch64.o +OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_init.o OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o -OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o +OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_init_aarch64.o OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o -OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_aarch64.o +OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_10bpp_aarch64.o \ + aarch64/vp9dsp_init_12bpp_aarch64.o \ + aarch64/vp9dsp_init_aarch64.o # ARMv8 optimizations @@ -27,6 +30,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o # NEON optimizations # subsystems +NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o @@ -36,14 +40,19 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o -NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o +NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \ + aarch64/simple_idct_neon.o NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o +NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o # decoders/encoders -NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \ - aarch64/synth_filter_neon.o +NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o +NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o -NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_neon.o \ +NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ + aarch64/vp9itxfm_neon.o \ + aarch64/vp9lpf_16bpp_neon.o \ aarch64/vp9lpf_neon.o \ + aarch64/vp9mc_16bpp_neon.o \ aarch64/vp9mc_neon.o diff --git a/libavcodec/aarch64/aacpsdsp_init_aarch64.c b/libavcodec/aarch64/aacpsdsp_init_aarch64.c new file mode 100644 index 0000000000..5e7e19bba4 --- /dev/null +++ b/libavcodec/aarch64/aacpsdsp_init_aarch64.c @@ -0,0 +1,48 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/aacpsdsp.h" + +void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n); +void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2], + float *src1, int n); +void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2], + const float (*filter)[8][2], + ptrdiff_t stride, int n); +void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2], + float h[2][4], float h_step[2][4], + int len); +void ff_ps_stereo_interpolate_ipdopd_neon(float (*l)[2], float (*r)[2], + float h[2][4], float h_step[2][4], + int len); + +av_cold void ff_psdsp_init_aarch64(PSDSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->add_squares = ff_ps_add_squares_neon; + s->mul_pair_single = ff_ps_mul_pair_single_neon; + s->hybrid_analysis = ff_ps_hybrid_analysis_neon; + s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon; + s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_neon; + } +} diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S new file mode 100644 index 0000000000..ff4e6e244a --- /dev/null +++ b/libavcodec/aarch64/aacpsdsp_neon.S @@ -0,0 +1,148 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +function ff_ps_add_squares_neon, export=1 +1: ld1 {v0.4S,v1.4S}, [x1], #32 + fmul v0.4S, v0.4S, v0.4S + fmul v1.4S, v1.4S, v1.4S + faddp v2.4S, v0.4S, v1.4S + ld1 {v3.4S}, [x0] + fadd v3.4S, v3.4S, v2.4S + st1 {v3.4S}, [x0], #16 + subs w2, w2, #4 + b.gt 1b + ret +endfunc + +function ff_ps_mul_pair_single_neon, export=1 +1: ld1 {v0.4S,v1.4S}, [x1], #32 + ld1 {v2.4S}, [x2], #16 + zip1 v3.4S, v2.4S, v2.4S + zip2 v4.4S, v2.4S, v2.4S + fmul v0.4S, v0.4S, v3.4S + fmul v1.4S, v1.4S, v4.4S + st1 {v0.4S,v1.4S}, [x0], #32 + subs w3, w3, #4 + b.gt 1b + ret +endfunc + +function ff_ps_stereo_interpolate_neon, export=1 + ld1 {v0.4S}, [x2] + ld1 {v1.4S}, [x3] + zip1 v4.4S, v0.4S, v0.4S + zip2 v5.4S, v0.4S, v0.4S + zip1 v6.4S, v1.4S, v1.4S + zip2 v7.4S, v1.4S, v1.4S +1: ld1 {v2.2S}, [x0] + ld1 {v3.2S}, [x1] + fadd v4.4S, v4.4S, v6.4S + fadd v5.4S, v5.4S, v7.4S + mov v2.D[1], v2.D[0] + mov v3.D[1], v3.D[0] + fmul v2.4S, v2.4S, v4.4S + fmla v2.4S, v3.4S, v5.4S + st1 {v2.D}[0], [x0], #8 + st1 {v2.D}[1], [x1], #8 + subs w4, w4, #1 + b.gt 1b + ret +endfunc + +function ff_ps_stereo_interpolate_ipdopd_neon, export=1 + ld1 {v0.4S,v1.4S}, [x2] + ld1 {v6.4S,v7.4S}, [x3] + fneg v2.4S, v1.4S + fneg v3.4S, v7.4S + zip1 v16.4S, v0.4S, v0.4S + zip2 v17.4S, v0.4S, v0.4S + zip1 v18.4S, v2.4S, v1.4S + zip2 v19.4S, v2.4S, v1.4S + zip1 v20.4S, v6.4S, v6.4S + zip2 v21.4S, v6.4S, v6.4S + zip1 v22.4S, v3.4S, v7.4S + zip2 v23.4S, v3.4S, v7.4S +1: ld1 {v2.2S}, [x0] + ld1 {v3.2S}, [x1] + fadd v16.4S, v16.4S, v20.4S + fadd v17.4S, v17.4S, v21.4S + mov v2.D[1], v2.D[0] + mov v3.D[1], v3.D[0] + fmul v4.4S, v2.4S, v16.4S + fmla v4.4S, v3.4S, v17.4S + fadd v18.4S, v18.4S, v22.4S + fadd v19.4S, v19.4S, v23.4S + ext v2.16B, v2.16B, v2.16B, #4 + ext v3.16B, v3.16B, v3.16B, #4 + fmla v4.4S, v2.4S, v18.4S + fmla v4.4S, v3.4S, v19.4S + st1 {v4.D}[0], [x0], #8 + st1 {v4.D}[1], [x1], #8 + subs w4, w4, #1 + b.gt 1b + ret +endfunc + +function ff_ps_hybrid_analysis_neon, export=1 + lsl x3, x3, #3 + ld2 {v0.4S,v1.4S}, [x1], #32 + ld2 {v2.2S,v3.2S}, [x1], #16 + ld1 {v24.2S}, [x1], #8 + ld2 {v4.2S,v5.2S}, [x1], #16 + ld2 {v6.4S,v7.4S}, [x1] + rev64 v6.4S, v6.4S + rev64 v7.4S, v7.4S + ext v6.16B, v6.16B, v6.16B, #8 + ext v7.16B, v7.16B, v7.16B, #8 + rev64 v4.2S, v4.2S + rev64 v5.2S, v5.2S + mov v2.D[1], v3.D[0] + mov v4.D[1], v5.D[0] + mov v5.D[1], v2.D[0] + mov v3.D[1], v4.D[0] + fadd v16.4S, v0.4S, v6.4S + fadd v17.4S, v1.4S, v7.4S + fsub v18.4S, v1.4S, v7.4S + fsub v19.4S, v0.4S, v6.4S + fadd v22.4S, v2.4S, v4.4S + fsub v23.4S, v5.4S, v3.4S + trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5} + trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7} +1: ld2 {v2.4S,v3.4S}, [x2], #32 + ld2 {v4.2S,v5.2S}, [x2], #16 + ld1 {v6.2S}, [x2], #8 + add x2, x2, #8 + mov v4.D[1], v5.D[0] + mov v6.S[1], v6.S[0] + fmul v6.2S, v6.2S, v24.2S + fmul v0.4S, v2.4S, v16.4S + fmul v1.4S, v2.4S, v17.4S + fmls v0.4S, v3.4S, v18.4S + fmla v1.4S, v3.4S, v19.4S + fmla v0.4S, v4.4S, v20.4S + fmla v1.4S, v4.4S, v21.4S + faddp v0.4S, v0.4S, v1.4S + faddp v0.4S, v0.4S, v0.4S + fadd v0.2S, v0.2S, v6.2S + st1 {v0.2S}, [x0], x3 + subs w4, w4, #1 + b.gt 1b + ret +endfunc diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h index 60e32ddd1d..e05c5ad2e4 100644 --- a/libavcodec/aarch64/asm-offsets.h +++ b/libavcodec/aarch64/asm-offsets.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/cabac.h b/libavcodec/aarch64/cabac.h index e12953e86c..6b9b77eb30 100644 --- a/libavcodec/aarch64/cabac.h +++ b/libavcodec/aarch64/cabac.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S deleted file mode 100644 index 4cd3328042..0000000000 --- a/libavcodec/aarch64/dcadsp_neon.S +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> - * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/aarch64/asm.S" - -function ff_dca_lfe_fir0_neon, export=1 - mov x3, #32 // decifactor - sub x1, x1, #7*4 - add x4, x0, #2*32*4 - 16 // out2 - mov x7, #-16 - - ld1 {v0.4s,v1.4s}, [x1] - // reverse [-num_coeffs + 1, 0] - ext v3.16b, v0.16b, v0.16b, #8 - ext v2.16b, v1.16b, v1.16b, #8 - rev64 v3.4s, v3.4s - rev64 v2.4s, v2.4s -1: - ld1 {v4.4s,v5.4s}, [x2], #32 - ld1 {v6.4s,v7.4s}, [x2], #32 - subs x3, x3, #4 - fmul v16.4s, v2.4s, v4.4s - fmul v23.4s, v0.4s, v4.4s - fmul v17.4s, v2.4s, v6.4s - fmul v22.4s, v0.4s, v6.4s - - fmla v16.4s, v3.4s, v5.4s - fmla v23.4s, v1.4s, v5.4s - ld1 {v4.4s,v5.4s}, [x2], #32 - fmla v17.4s, v3.4s, v7.4s - fmla v22.4s, v1.4s, v7.4s - ld1 {v6.4s,v7.4s}, [x2], #32 - fmul v18.4s, v2.4s, v4.4s - fmul v21.4s, v0.4s, v4.4s - fmul v19.4s, v2.4s, v6.4s - fmul v20.4s, v0.4s, v6.4s - - fmla v18.4s, v3.4s, v5.4s - fmla v21.4s, v1.4s, v5.4s - fmla v19.4s, v3.4s, v7.4s - fmla v20.4s, v1.4s, v7.4s - - faddp v16.4s, v16.4s, v17.4s - faddp v18.4s, v18.4s, v19.4s - faddp v20.4s, v20.4s, v21.4s - faddp v22.4s, v22.4s, v23.4s - faddp v16.4s, v16.4s, v18.4s - faddp v20.4s, v20.4s, v22.4s - - st1 {v16.4s}, [x0], #16 - st1 {v20.4s}, [x4], x7 - b.gt 1b - - ret -endfunc - -function ff_dca_lfe_fir1_neon, export=1 - mov x3, #64 // decifactor - sub x1, x1, #3*4 - add x4, x0, #2*64*4 - 16 // out2 - mov x7, #-16 - - ld1 {v0.4s}, [x1] - // reverse [-num_coeffs + 1, 0] - ext v1.16b, v0.16b, v0.16b, #8 - rev64 v1.4s, v1.4s - -1: - ld1 {v4.4s,v5.4s}, [x2], #32 - ld1 {v6.4s,v7.4s}, [x2], #32 - subs x3, x3, #4 - fmul v16.4s, v1.4s, v4.4s - fmul v23.4s, v0.4s, v4.4s - fmul v17.4s, v1.4s, v5.4s - fmul v22.4s, v0.4s, v5.4s - fmul v18.4s, v1.4s, v6.4s - fmul v21.4s, v0.4s, v6.4s - fmul v19.4s, v1.4s, v7.4s - fmul v20.4s, v0.4s, v7.4s - faddp v16.4s, v16.4s, v17.4s - faddp v18.4s, v18.4s, v19.4s - faddp v20.4s, v20.4s, v21.4s - faddp v22.4s, v22.4s, v23.4s - faddp v16.4s, v16.4s, v18.4s - faddp v20.4s, v20.4s, v22.4s - st1 {v16.4s}, [x0], #16 - st1 {v20.4s}, [x4], x7 - b.gt 1b - - ret -endfunc diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c index 9cc57d331e..db285205ab 100644 --- a/libavcodec/aarch64/fft_init_aarch64.c +++ b/libavcodec/aarch64/fft_init_aarch64.c @@ -1,23 +1,25 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "config.h" + #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/aarch64/cpu.h" @@ -27,6 +29,10 @@ void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); + av_cold void ff_fft_init_aarch64(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -34,5 +40,11 @@ av_cold void ff_fft_init_aarch64(FFTContext *s) if (have_neon(cpu_flags)) { s->fft_permute = ff_fft_permute_neon; s->fft_calc = ff_fft_calc_neon; +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; +#endif } } diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S index e205e23d88..862039f97d 100644 --- a/libavcodec/aarch64/fft_neon.S +++ b/libavcodec/aarch64/fft_neon.S @@ -8,20 +8,20 @@ * This algorithm (though not any of the implementation details) is * based on libdjbfft by D. J. Bernstein. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/fmtconvert_init.c b/libavcodec/aarch64/fmtconvert_init.c index 0a55a1b88c..210e74b654 100644 --- a/libavcodec/aarch64/fmtconvert_init.c +++ b/libavcodec/aarch64/fmtconvert_init.c @@ -1,20 +1,20 @@ /* * ARM optimized Format Conversion Utils * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/fmtconvert_neon.S b/libavcodec/aarch64/fmtconvert_neon.S index 3b33c87ade..2161c3a8ae 100644 --- a/libavcodec/aarch64/fmtconvert_neon.S +++ b/libavcodec/aarch64/fmtconvert_neon.S @@ -3,20 +3,20 @@ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c index a373291344..fa6e0eaf15 100644 --- a/libavcodec/aarch64/h264chroma_init_aarch64.c +++ b/libavcodec/aarch64/h264chroma_init_aarch64.c @@ -2,20 +2,20 @@ * ARM NEON optimised H.264 chroma functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S index edc256cbc3..8be7578001 100644 --- a/libavcodec/aarch64/h264cmc_neon.S +++ b/libavcodec/aarch64/h264cmc_neon.S @@ -2,20 +2,20 @@ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -442,7 +442,7 @@ endconst h264_chroma_mc4 avg, rv40 #endif -#if CONFIG_VC1_DECODER +#if CONFIG_VC1DSP h264_chroma_mc8 put, vc1 h264_chroma_mc8 avg, vc1 h264_chroma_mc4 put, vc1 diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index 07bda2ff07..10cf199333 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -44,20 +44,20 @@ void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride, void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride, int alpha, int beta); -void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, +void ff_weight_h264_pixels_16_neon(uint8_t *dst, ptrdiff_t stride, int height, int log2_den, int weight, int offset); -void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, +void ff_weight_h264_pixels_8_neon(uint8_t *dst, ptrdiff_t stride, int height, int log2_den, int weight, int offset); -void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, +void ff_weight_h264_pixels_4_neon(uint8_t *dst, ptrdiff_t stride, int height, int log2_den, int weight, int offset); -void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, +void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_den, int weightd, int weights, int offset); -void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, +void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_den, int weightd, int weights, int offset); -void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, +void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_den, int weightd, int weights, int offset); @@ -91,6 +91,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + if (chroma_format_idc <= 1) c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 448e575b8c..80ac09d2be 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -3,20 +3,20 @@ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S index 1c43c1f301..7de44205d3 100644 --- a/libavcodec/aarch64/h264idct_neon.S +++ b/libavcodec/aarch64/h264idct_neon.S @@ -2,20 +2,20 @@ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,6 +23,7 @@ #include "neon.S" function ff_h264_idct_add_neon, export=1 +.L_ff_h264_idct_add_neon: ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] sxtw x2, w2 movi v30.8H, #0 @@ -77,6 +78,7 @@ function ff_h264_idct_add_neon, export=1 endfunc function ff_h264_idct_dc_add_neon, export=1 +.L_ff_h264_idct_dc_add_neon: sxtw x2, w2 mov w3, #0 ld1r {v2.8H}, [x1] @@ -106,8 +108,8 @@ function ff_h264_idct_add16_neon, export=1 mov w9, w3 // stride movrel x7, scan8 mov x10, #16 - movrel x13, X(ff_h264_idct_dc_add_neon) - movrel x14, X(ff_h264_idct_add_neon) + movrel x13, .L_ff_h264_idct_dc_add_neon + movrel x14, .L_ff_h264_idct_add_neon 1: mov w2, w9 ldrb w3, [x7], #1 ldrsw x0, [x5], #4 @@ -133,8 +135,8 @@ function ff_h264_idct_add16intra_neon, export=1 mov w9, w3 // stride movrel x7, scan8 mov x10, #16 - movrel x13, X(ff_h264_idct_dc_add_neon) - movrel x14, X(ff_h264_idct_add_neon) + movrel x13, .L_ff_h264_idct_dc_add_neon + movrel x14, .L_ff_h264_idct_add_neon 1: mov w2, w9 ldrb w3, [x7], #1 ldrsw x0, [x5], #4 @@ -160,8 +162,8 @@ function ff_h264_idct_add8_neon, export=1 add x5, x1, #16*4 // block_offset add x9, x2, #16*32 // block mov w19, w3 // stride - movrel x13, X(ff_h264_idct_dc_add_neon) - movrel x14, X(ff_h264_idct_add_neon) + movrel x13, .L_ff_h264_idct_dc_add_neon + movrel x14, .L_ff_h264_idct_add_neon movrel x7, scan8, 16 mov x10, #0 mov x11, #16 @@ -263,6 +265,7 @@ endfunc .endm function ff_h264_idct8_add_neon, export=1 +.L_ff_h264_idct8_add_neon: movi v19.8H, #0 sxtw x2, w2 ld1 {v24.8H, v25.8H}, [x1] @@ -326,6 +329,7 @@ function ff_h264_idct8_add_neon, export=1 endfunc function ff_h264_idct8_dc_add_neon, export=1 +.L_ff_h264_idct8_dc_add_neon: mov w3, #0 sxtw x2, w2 ld1r {v31.8H}, [x1] @@ -375,8 +379,8 @@ function ff_h264_idct8_add4_neon, export=1 mov w2, w3 movrel x7, scan8 mov w10, #16 - movrel x13, X(ff_h264_idct8_dc_add_neon) - movrel x14, X(ff_h264_idct8_add_neon) + movrel x13, .L_ff_h264_idct8_dc_add_neon + movrel x14, .L_ff_h264_idct8_add_neon 1: ldrb w9, [x7], #4 ldrsw x0, [x5], #16 ldrb w9, [x4, w9, UXTW] diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c index 8f912cbca9..b144376f90 100644 --- a/libavcodec/aarch64/h264pred_init.c +++ b/libavcodec/aarch64/h264pred_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S index a38a27f186..213b40b3e7 100644 --- a/libavcodec/aarch64/h264pred_neon.S +++ b/libavcodec/aarch64/h264pred_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c index 74088b216c..77f41d9a21 100644 --- a/libavcodec/aarch64/h264qpel_init_aarch64.c +++ b/libavcodec/aarch64/h264qpel_init_aarch64.c @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S index 731dc0658d..d27cfac494 100644 --- a/libavcodec/aarch64/h264qpel_neon.S +++ b/libavcodec/aarch64/h264qpel_neon.S @@ -2,20 +2,20 @@ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/hpeldsp_init_aarch64.c b/libavcodec/aarch64/hpeldsp_init_aarch64.c index 6bc4c09f6c..144ae2bcc4 100644 --- a/libavcodec/aarch64/hpeldsp_init_aarch64.c +++ b/libavcodec/aarch64/hpeldsp_init_aarch64.c @@ -2,20 +2,20 @@ * ARM NEON optimised DSP functions * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S index 29782908f8..a491c173bb 100644 --- a/libavcodec/aarch64/hpeldsp_neon.S +++ b/libavcodec/aarch64/hpeldsp_neon.S @@ -3,20 +3,20 @@ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/idct.h b/libavcodec/aarch64/idct.h new file mode 100644 index 0000000000..5c49046148 --- /dev/null +++ b/libavcodec/aarch64/idct.h @@ -0,0 +1,28 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AARCH64_IDCT_H +#define AVCODEC_AARCH64_IDCT_H + +#include <stdint.h> + +void ff_simple_idct_neon(int16_t *data); +void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +#endif /* AVCODEC_AARCH64_IDCT_H */ diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c new file mode 100644 index 0000000000..0406e60830 --- /dev/null +++ b/libavcodec/aarch64/idctdsp_init_aarch64.c @@ -0,0 +1,41 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" + +av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!avctx->lowres && !high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->perm_type = FF_IDCT_PERM_PARTTRANS; + } + } +} diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c deleted file mode 100644 index 38018f2b4a..0000000000 --- a/libavcodec/aarch64/imdct15_init.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> - -#include "libavutil/cpu.h" -#include "libavutil/aarch64/cpu.h" -#include "libavutil/internal.h" - -#include "libavcodec/imdct15.h" - -#include "asm-offsets.h" - -AV_CHECK_OFFSET(IMDCT15Context, exptab, CELT_EXPTAB); -AV_CHECK_OFFSET(IMDCT15Context, fft_n, CELT_FFT_N); -AV_CHECK_OFFSET(IMDCT15Context, len2, CELT_LEN2); -AV_CHECK_OFFSET(IMDCT15Context, len4, CELT_LEN4); -AV_CHECK_OFFSET(IMDCT15Context, tmp, CELT_TMP); -AV_CHECK_OFFSET(IMDCT15Context, twiddle_exptab, CELT_TWIDDLE); - -void ff_celt_imdct_half_neon(IMDCT15Context *s, float *dst, const float *src, - ptrdiff_t stride, float scale); - -void ff_imdct15_init_aarch64(IMDCT15Context *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->imdct_half = ff_celt_imdct_half_neon; - } -} diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S deleted file mode 100644 index d99edf4108..0000000000 --- a/libavcodec/aarch64/imdct15_neon.S +++ /dev/null @@ -1,647 +0,0 @@ -/* - * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/aarch64/asm.S" - -#include "asm-offsets.h" - -.macro shuffle a, b, c, d -const shuffle_\a\b\c\d, align=4 - .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3) - .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3) - .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3) - .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3) -endconst -.endm - -shuffle 0, 2, 1, 3 -shuffle 1, 0, 3, 2 -shuffle 2, 3, 0, 1 -shuffle 3, 1, 2, 0 - - -function fft5_neon - lsl x2, x2, #3 - ld1 {v24.2s}, [x1], x2 - ld2 {v25.s,v26.s}[0], [x1], x2 - ld2 {v25.s,v26.s}[1], [x1], x2 - ld2 {v25.s,v26.s}[2], [x1], x2 - ld2 {v25.s,v26.s}[3], [x1] - dup v6.4s, v24.s[0] - dup v7.4s, v24.s[1] - - faddp v0.4s, v25.4s, v26.4s - // z[][0], z[][3] - fmul v16.4s, v25.4s, v15.s[0] // rr - fmul v17.4s, v25.4s, v15.s[1] // ri - fmul v18.4s, v26.4s, v15.s[0] // ir - fmul v19.4s, v26.4s, v15.s[1] // ii - faddp v0.4s, v0.4s, v0.4s - // z[][1], z[][2] - fmul v20.4s, v25.4s, v15.s[2] // rr - fmul v21.4s, v25.4s, v15.s[3] // ri - fmul v22.4s, v26.4s, v15.s[2] // ir - fmul v23.4s, v26.4s, v15.s[3] // ii - fadd v0.2s, v24.2s, v0.2s // out[0] - - // z[0123][0], z[0123][3] - fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii; - fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii; - ld1 {v16.16b}, [x11] - ld1 {v19.16b}, [x14] - fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir; - fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir; - ld1 {v17.16b}, [x12] - // z[0123][1], z[0123][2] - fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii; - fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii; - ld1 {v18.16b}, [x13] - fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir; - fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir; - - //real - tbl v20.16b, {v24.16b}, v16.16b - tbl v21.16b, {v25.16b}, v17.16b - tbl v22.16b, {v26.16b}, v18.16b - tbl v23.16b, {v27.16b}, v19.16b - //imag - tbl v16.16b, {v28.16b}, v16.16b - tbl v17.16b, {v29.16b}, v17.16b - tbl v18.16b, {v30.16b}, v18.16b - tbl v19.16b, {v31.16b}, v19.16b - - fadd v6.4s, v6.4s, v20.4s - fadd v22.4s, v22.4s, v23.4s - fadd v7.4s, v7.4s, v16.4s - fadd v18.4s, v18.4s, v19.4s - - fadd v21.4s, v21.4s, v22.4s - fadd v17.4s, v17.4s, v18.4s - fadd v6.4s, v6.4s, v21.4s - fadd v7.4s, v7.4s, v17.4s - - ret -endfunc - -function fft15_neon - mov x8, x1 - mov x9, x30 - add x2, x3, x3, lsl #1 // 3 * stride - - add x1, x8, x3, lsl #3 // in + 1 * stride - bl fft5_neon - mov v1.8b, v0.8b - mov v2.16b, v6.16b - mov v3.16b, v7.16b - - add x1, x8, x3, lsl #4 // in + 2 * stride - add x2, x3, x3, lsl #1 // 3 * stride - bl fft5_neon - zip1 v1.4s, v1.4s, v0.4s - mov v4.16b, v6.16b - mov v5.16b, v7.16b - - mov x1, x8 // in + 0 * stride - add x2, x3, x3, lsl #1 // 3 * stride - bl fft5_neon - - faddp v20.4s, v1.4s, v1.4s - - ext v18.16b, v8.16b, v8.16b, #4 - ext v19.16b, v9.16b, v9.16b, #4 - mov v16.16b, v6.16b - mov v17.16b, v7.16b - fadd v20.2s, v20.2s, v0.2s - - uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re - uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im - - st1 {v20.2s}, [x0], #8 // out[0] - - fmla v16.4s, v2.4s, v8.4s - fmls v16.4s, v3.4s, v9.4s - - fmla v17.4s, v2.4s, v9.4s - fmla v17.4s, v3.4s, v8.4s - - fmla v16.4s, v4.4s, v18.4s - fmls v16.4s, v5.4s, v19.4s - - fmla v17.4s, v4.4s, v19.4s - fmla v17.4s, v5.4s, v18.4s - - zip1 v18.4s, v16.4s, v17.4s - zip2 v19.4s, v16.4s, v17.4s - - rev64 v31.4s, v14.4s - trn1 v28.2d, v1.2d, v1.2d - trn2 v29.2d, v1.2d, v1.2d - zip1 v30.2d, v14.2d, v31.2d - zip2 v31.2d, v14.2d, v31.2d - - st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4] - - fmul v16.4s, v28.4s, v30.4s - fmul v17.4s, v29.4s, v30.4s - fmls v16.4s, v29.4s, v31.4s - fmla v17.4s, v28.4s, v31.4s - faddp v16.4s, v16.4s, v16.4s - faddp v17.4s, v17.4s, v17.4s - zip1 v18.2s, v16.2s, v17.2s - zip2 v19.2s, v16.2s, v17.2s - - fadd v18.2s, v18.2s, v0.2s - fadd v0.2s, v19.2s, v0.2s - - ext v30.16b, v12.16b, v12.16b, #4 - ext v31.16b, v13.16b, v13.16b, #4 - mov v16.16b, v6.16b - mov v17.16b, v7.16b - - uzp1 v30.4s, v30.4s, v8.4s - uzp1 v31.4s, v31.4s, v9.4s - - st1 {v18.2s}, [x0], #8 // out[5] - - fmla v16.4s, v2.4s, v10.4s - fmls v16.4s, v3.4s, v11.4s - - fmla v17.4s, v2.4s, v11.4s - fmla v17.4s, v3.4s, v10.4s - - fmla v16.4s, v4.4s, v30.4s - fmls v16.4s, v5.4s, v31.4s - - fmla v17.4s, v4.4s, v31.4s - fmla v17.4s, v5.4s, v30.4s - - zip1 v18.4s, v16.4s, v17.4s - zip2 v19.4s, v16.4s, v17.4s - - ext v30.16b, v10.16b, v10.16b, #4 - ext v31.16b, v11.16b, v11.16b, #4 - - fmla v6.4s, v2.4s, v12.4s - fmls v6.4s, v3.4s, v13.4s - - st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9] - - uzp1 v30.4s, v30.4s, v12.4s - uzp1 v31.4s, v31.4s, v13.4s - - fmla v7.4s, v2.4s, v13.4s - fmla v7.4s, v3.4s, v12.4s - - st1 {v0.2s}, [x0], #8 // out[10] - - fmla v6.4s, v4.4s, v30.4s - fmls v6.4s, v5.4s, v31.4s - - fmla v7.4s, v4.4s, v31.4s - fmla v7.4s, v5.4s, v30.4s - - zip1 v18.4s, v6.4s, v7.4s - zip2 v19.4s, v6.4s, v7.4s - - st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14] - - ret x9 -endfunc - -// x0: out, x1: out+len2, x2: exptab, x3: len2 -function fft15_pass - ands x6, x3, #3 - mov x4, x0 - mov x5, x1 - b.eq 9f - ld1 {v0.2s}, [x0], #8 - ld1 {v1.2s}, [x1], #8 - sub x3, x3, x6 - subs x6, x6, #1 - fadd v2.2s, v0.2s, v1.2s - fsub v3.2s, v0.2s, v1.2s - add x2, x2, #8 - st1 {v2.2s}, [x4], #8 - st1 {v3.2s}, [x5], #8 - b.eq 9f -1: - subs x6, x6, #1 - ldp s4, s5, [x2], #8 - ldp s2, s3, [x1], #8 - ldp s0, s1, [x0], #8 - - fmul s6, s2, s4 - fmul s7, s2, s5 - fmls s6, s3, v5.s[0] - fmla s7, s3, v4.s[0] - - fsub s2, s0, s6 - fsub s3, s1, s7 - fadd s0, s0, s6 - fadd s1, s1, s7 - - stp s2, s3, [x5], #8 - stp s0, s1, [x4], #8 - b.gt 1b -9: - ld1 {v4.4s,v5.4s}, [x2], #32 - ld2 {v2.4s,v3.4s}, [x1], #32 - uzp1 v6.4s, v4.4s, v5.4s - uzp2 v7.4s, v4.4s, v5.4s - ld2 {v0.4s,v1.4s}, [x0], #32 -8: - subs x3, x3, #8 - - fmul v4.4s, v2.4s, v6.4s - fmul v5.4s, v2.4s, v7.4s - b.lt 4f - - ld1 {v18.4s,v19.4s}, [x2], #32 - - fmls v4.4s, v3.4s, v7.4s - fmla v5.4s, v3.4s, v6.4s - - ld2 {v22.4s,v23.4s}, [x1], #32 - - fsub v2.4s, v0.4s, v4.4s - fadd v0.4s, v0.4s, v4.4s - fsub v3.4s, v1.4s, v5.4s - fadd v1.4s, v1.4s, v5.4s - - uzp1 v16.4s, v18.4s, v19.4s - uzp2 v17.4s, v18.4s, v19.4s - - st2 {v2.4s,v3.4s}, [x5], #32 - st2 {v0.4s,v1.4s}, [x4], #32 - ld2 {v20.4s,v21.4s}, [x0], #32 - - fmul v18.4s, v22.4s, v16.4s - fmul v19.4s, v22.4s, v17.4s - b.eq 0f - - ld1 {v4.4s,v5.4s}, [x2], #32 - - fmls v18.4s, v23.4s, v17.4s - fmla v19.4s, v23.4s, v16.4s - - ld2 {v2.4s,v3.4s}, [x1], #32 - - fsub v22.4s, v20.4s, v18.4s - fadd v20.4s, v20.4s, v18.4s - fsub v23.4s, v21.4s, v19.4s - fadd v21.4s, v21.4s, v19.4s - - uzp1 v6.4s, v4.4s, v5.4s - uzp2 v7.4s, v4.4s, v5.4s - - st2 {v22.4s,v23.4s}, [x5], #32 - st2 {v20.4s,v21.4s}, [x4], #32 - ld2 {v0.4s,v1.4s}, [x0], #32 - - b 8b -4: - fmls v4.4s, v3.4s, v7.4s - fmla v5.4s, v3.4s, v6.4s - - fsub v2.4s, v0.4s, v4.4s - fadd v0.4s, v0.4s, v4.4s - fsub v3.4s, v1.4s, v5.4s - fadd v1.4s, v1.4s, v5.4s - - st2 {v2.4s,v3.4s}, [x5], #32 - st2 {v0.4s,v1.4s}, [x4], #32 - - ret -0: - fmls v18.4s, v23.4s, v17.4s - fmla v19.4s, v23.4s, v16.4s - - fsub v22.4s, v20.4s, v18.4s - fadd v20.4s, v20.4s, v18.4s - fsub v23.4s, v21.4s, v19.4s - fadd v21.4s, v21.4s, v19.4s - - st2 {v22.4s,v23.4s}, [x5], #32 - st2 {v20.4s,v21.4s}, [x4], #32 - - ret -endfunc - -function fft30_neon, align=6 - sub sp, sp, #0x20 - stp x20, x21, [sp] - stp x22, x30, [sp, #0x10] - mov x21, x1 - mov x22, x2 - mov x20, x4 - mov x0, x21 - mov x1, x22 - lsl x3, x20, #1 - bl fft15_neon - - add x0, x21, #15*8 - add x1, x22, x20, lsl #3 - lsl x3, x20, #1 - bl fft15_neon - - ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1] - add x0, x21, #0 - add x1, x21, #15*8 - mov x3, #15 - ldp x20, x21, [sp] - ldp x22, x30, [sp, #0x10] - add sp, sp, #0x20 - b fft15_pass -endfunc - -.macro def_fft n, n2 -function fft\n\()_neon, align=6 - sub sp, sp, #0x30 - stp x20, x21, [sp] - stp x22, x30, [sp, #0x10] - stp x23, x24, [sp, #0x20] - mov x21, x1 - mov x22, x2 - mov x23, x3 - mov x20, x4 - sub x3, x3, #1 - lsl x4, x4, #1 - bl fft\n2\()_neon - - add x1, x21, #(\n2 * 8) - add x2, x22, x20, lsl #3 - sub x3, x23, #1 - lsl x4, x20, #1 - bl fft\n2\()_neon - - add x5, x10, #CELT_EXPTAB - mov x0, x21 - ldr x2, [x5, x23, lsl #3] // s->exptab[N] - add x1, x21, #(\n2 * 8) - mov x3, #\n2 - ldp x20, x21, [sp] - ldp x22, x30, [sp, #0x10] - ldp x23, x24, [sp, #0x20] - add sp, sp, #0x30 - b fft15_pass -endfunc -.endm - - def_fft 60, 30 - def_fft 120, 60 - def_fft 240, 120 - def_fft 480, 240 - def_fft 960, 480 - -function fft_b15_calc_neon - sub sp, sp, #0x50 - ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0] - movrel x6, fact5 - movrel x11, shuffle_0213 - movrel x12, shuffle_1032 - movrel x13, shuffle_2301 - movrel x14, shuffle_3120 - add x8, x8, #8 - movrel x5, fft_tab_neon - stp x20, x30, [sp] - stp d8, d9, [sp, #0x10] - stp d10, d11, [sp, #0x20] - stp d12, d13, [sp, #0x30] - stp d14, d15, [sp, #0x40] - ld1 {v15.4s}, [x6] - ld1 {v0.4s,v1.4s}, [x8], #32 - ld1 {v6.2s}, [x8], #8 - ld1 {v2.4s,v3.4s}, [x8], #32 - ld1 {v7.2s}, [x8], #8 - ld1 {v4.4s,v5.4s}, [x8], #32 - uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re - uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im - uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re - uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im - uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re - uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im - zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im - add x5, x5, x3, lsl #3 - ldr x5, [x5] - mov x10, x0 - blr x5 - ldp x20, x30, [sp] - ldp d8, d9, [sp, #0x10] - ldp d10, d11, [sp, #0x20] - ldp d12, d13, [sp, #0x30] - ldp d14, d15, [sp, #0x40] - add sp, sp, #0x50 - ret -endfunc - -const fft_tab_neon, relocate=1 - .quad fft15_neon - .quad fft30_neon - .quad fft60_neon - .quad fft120_neon - .quad fft240_neon - .quad fft480_neon - .quad fft960_neon -endconst - -function ff_celt_imdct_half_neon, export=1 - sub sp, sp, #0x20 - stp x21, x30, [sp] - str s0, [sp, #0x10] - - ldp w5, w6, [x0, #CELT_LEN2] // CELT_LEN4 - mov x10, x0 - mov x21, x1 - sub w5, w5, #1 - lsl x7, x3, #3 // 2 * stride * sizeof(float) - sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float) - mul x5, x5, x3 - ldp x9, x10, [x0, #CELT_TMP] // CELT_TWIDDLE - ldr w3, [x0, #CELT_FFT_N] - add x5, x2, x5, lsl #2 - mov x11, x9 - - sub w6, w6, #4 - ld1 {v0.s}[0], [x5], x8 - ld1 {v1.s}[0], [x2], x7 - ld1 {v4.4s,v5.4s}, [x10], #32 - ld1 {v0.s}[1], [x5], x8 - ld1 {v1.s}[1], [x2], x7 - uzp1 v2.4s, v4.4s, v5.4s - ld1 {v0.s}[2], [x5], x8 - ld1 {v1.s}[2], [x2], x7 - uzp2 v3.4s, v4.4s, v5.4s - ld1 {v0.s}[3], [x5], x8 - ld1 {v1.s}[3], [x2], x7 -1: - subs w6, w6, #4 - - ld1 {v20.s}[0], [x5], x8 - ld1 {v21.s}[0], [x2], x7 - ld1 {v4.4s,v5.4s}, [x10], #32 - - fmul v6.4s, v0.4s, v2.4s - fmul v7.4s, v0.4s, v3.4s - - ld1 {v20.s}[1], [x5], x8 - ld1 {v21.s}[1], [x2], x7 - - fmls v6.4s, v1.4s, v3.4s - fmla v7.4s, v1.4s, v2.4s - - ld1 {v20.s}[2], [x5], x8 - ld1 {v21.s}[2], [x2], x7 - - uzp1 v2.4s, v4.4s, v5.4s - uzp2 v3.4s, v4.4s, v5.4s - ld1 {v20.s}[3], [x5], x8 - ld1 {v21.s}[3], [x2], x7 - - zip1 v4.4s, v6.4s, v7.4s - zip2 v5.4s, v6.4s, v7.4s - - fmul v6.4s, v20.4s, v2.4s - fmul v7.4s, v20.4s, v3.4s - - st1 {v4.4s,v5.4s}, [x9], #32 - - fmls v6.4s, v21.4s, v3.4s - fmla v7.4s, v21.4s, v2.4s - - b.eq 3f - - subs w6, w6, #4 - ld1 {v4.4s,v5.4s}, [x10], #32 - ld1 {v0.s}[0], [x5], x8 - ld1 {v1.s}[0], [x2], x7 - uzp1 v2.4s, v4.4s, v5.4s - ld1 {v0.s}[1], [x5], x8 - ld1 {v1.s}[1], [x2], x7 - uzp2 v3.4s, v4.4s, v5.4s - ld1 {v0.s}[2], [x5], x8 - ld1 {v1.s}[2], [x2], x7 - zip1 v4.4s, v6.4s, v7.4s - zip2 v5.4s, v6.4s, v7.4s - ld1 {v0.s}[3], [x5], x8 - ld1 {v1.s}[3], [x2], x7 - - st1 {v4.4s,v5.4s}, [x9], #32 - - b.gt 1b - - fmul v6.4s, v0.4s, v2.4s - fmul v7.4s, v0.4s, v3.4s - fmls v6.4s, v1.4s, v3.4s - fmla v7.4s, v1.4s, v2.4s -3: - zip1 v4.4s, v6.4s, v7.4s - zip2 v5.4s, v6.4s, v7.4s - st1 {v4.4s,v5.4s}, [x9], #32 - - mov x2, x11 - mov x4, #1 - - bl fft_b15_calc_neon - - ldr w5, [x10, #CELT_LEN4] - ldr x6, [x10, #CELT_TWIDDLE] - ldr s31, [sp, #0x10] - - add x1, x21, x5, lsl #2 - add x3, x6, x5, lsl #2 - sub x0, x1, #16 - sub x2, x3, #16 - mov x8, #-16 - mov x7, #16 - mov x10, x0 - mov x11, x1 - - sub w5, w5, #4 - - ld1 {v0.4s}, [x0], x8 - ld1 {v1.4s}, [x1], x7 - ld1 {v2.4s}, [x2], x8 - ld1 {v3.4s}, [x3], x7 - - uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re - uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im - - uzp1 v5.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].re - uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].im - - fmul v1.4s, v6.4s, v5.4s - fmul v0.4s, v6.4s, v7.4s -2: - subs w5, w5, #4 - - ld1 {v20.4s}, [x0], x8 - - fmla v1.4s, v4.4s, v7.4s - fmls v0.4s, v4.4s, v5.4s - - ld1 {v21.4s}, [x1], x7 - - ext v1.16b, v1.16b, v1.16b, #8 - fmul v0.4s, v0.4s, v31.s[0] - - ld1 {v2.4s}, [x2], x8 - - rev64 v1.4s, v1.4s - fmul v1.4s, v1.4s, v31.s[0] - - ld1 {v3.4s}, [x3], x7 - - zip1 v5.4s, v0.4s, v1.4s - zip2 v7.4s, v0.4s, v1.4s - - uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re - uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im - - st1 {v5.4s}, [x10], x8 - st1 {v7.4s}, [x11], x7 - - uzp1 v5.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].re - uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].im - - fmul v1.4s, v6.4s, v5.4s - fmul v0.4s, v6.4s, v7.4s - b.gt 2b - - fmla v1.4s, v4.4s, v7.4s - fmls v0.4s, v4.4s, v5.4s - ext v1.16b, v1.16b, v1.16b, #8 - fmul v0.4s, v0.4s, v31.s[0] - rev64 v1.4s, v1.4s - fmul v1.4s, v1.4s, v31.s[0] - zip1 v5.4s, v0.4s, v1.4s - zip2 v7.4s, v0.4s, v1.4s - st1 {v5.4s}, [x10], x8 - st1 {v7.4s}, [x11], x7 - - ldp x21, x30, [sp] - add sp, sp, #0x20 - ret -endfunc - -// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5) -const fact5, align=4 - .float 0.30901699437494745, 0.95105651629515353 - .float -0.80901699437494734, 0.58778525229247325 -endconst diff --git a/libavcodec/aarch64/mdct_init.c b/libavcodec/aarch64/mdct_init.c deleted file mode 100644 index 816111ab63..0000000000 --- a/libavcodec/aarch64/mdct_init.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/aarch64/cpu.h" - -#include "libavcodec/fft.h" - -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - -av_cold void ff_mdct_init_aarch64(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; - } -} diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S index bccd8323fd..1fd199c972 100644 --- a/libavcodec/aarch64/mdct_neon.S +++ b/libavcodec/aarch64/mdct_neon.S @@ -3,20 +3,20 @@ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/mpegaudiodsp_init.c b/libavcodec/aarch64/mpegaudiodsp_init.c index 849e310f62..5d966af5f4 100644 --- a/libavcodec/aarch64/mpegaudiodsp_init.c +++ b/libavcodec/aarch64/mpegaudiodsp_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S index 2a36f67603..b6ef131228 100644 --- a/libavcodec/aarch64/mpegaudiodsp_neon.S +++ b/libavcodec/aarch64/mpegaudiodsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S index 377009e244..0fddbecae3 100644 --- a/libavcodec/aarch64/neon.S +++ b/libavcodec/aarch64/neon.S @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/neontest.c b/libavcodec/aarch64/neontest.c index 201bfb1ce7..a24c22dd30 100644 --- a/libavcodec/aarch64/neontest.c +++ b/libavcodec/aarch64/neontest.c @@ -2,20 +2,20 @@ * check NEON registers for clobbers * Copyright (c) 2013 Martin Storsjo * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c index f7fcd5b493..142705db98 100644 --- a/libavcodec/aarch64/rv40dsp_init_aarch64.c +++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/sbrdsp_init_aarch64.c b/libavcodec/aarch64/sbrdsp_init_aarch64.c new file mode 100644 index 0000000000..9c967990df --- /dev/null +++ b/libavcodec/aarch64/sbrdsp_init_aarch64.c @@ -0,0 +1,70 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/aarch64/cpu.h" +#include "libavutil/attributes.h" +#include "libavcodec/sbrdsp.h" + +void ff_sbr_sum64x5_neon(float *z); +float ff_sbr_sum_square_neon(float (*x)[2], int n); +void ff_sbr_neg_odd_64_neon(float *x); +void ff_sbr_qmf_pre_shuffle_neon(float *z); +void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z); +void ff_sbr_qmf_deint_neg_neon(float *v, const float *src); +void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1); +void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2], + const float *g_filt, int m_max, intptr_t ixh); +void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2], + const float alpha0[2], const float alpha1[2], + float bw, int start, int end); +void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]); +void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); + +av_cold void ff_sbrdsp_init_aarch64(SBRDSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->sum64x5 = ff_sbr_sum64x5_neon; + s->sum_square = ff_sbr_sum_square_neon; + s->neg_odd_64 = ff_sbr_neg_odd_64_neon; + s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon; + s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon; + s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon; + s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon; + s->hf_g_filt = ff_sbr_hf_g_filt_neon; + s->hf_gen = ff_sbr_hf_gen_neon; + s->autocorrelate = ff_sbr_autocorrelate_neon; + s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon; + s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon; + s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon; + s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon; + } +} diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S new file mode 100644 index 0000000000..d23717e760 --- /dev/null +++ b/libavcodec/aarch64/sbrdsp_neon.S @@ -0,0 +1,327 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +const factors, align=4 + .float 1.0, -1.0, 1.0, -1.0 +endconst + +const phi_noise_0, align=4 + .float 1.0, 0.0, 1.0, 0.0 +endconst + +const phi_noise_1, align=4 + .float 0.0, 1.0, 0.0, -1.0 + .float 0.0, -1.0, 0.0, 1.0 +endconst + +const phi_noise_2, align=4 + .float -1.0, 0.0, -1.0, 0.0 +endconst + +const phi_noise_3, align=4 + .float 0.0, -1.0, 0.0, 1.0 + .float 0.0, 1.0, 0.0, -1.0 +endconst + +function ff_sbr_sum64x5_neon, export=1 + add x1, x0, #64*4 + add x2, x0, #128*4 + add x3, x0, #192*4 + add x4, x0, #256*4 + mov x5, #64 +1: ld1 {v0.4S}, [x0] + ld1 {v1.4S}, [x1], #16 + fadd v0.4S, v0.4S, v1.4S + ld1 {v2.4S}, [x2], #16 + fadd v0.4S, v0.4S, v2.4S + ld1 {v3.4S}, [x3], #16 + fadd v0.4S, v0.4S, v3.4S + ld1 {v4.4S}, [x4], #16 + fadd v0.4S, v0.4S, v4.4S + st1 {v0.4S}, [x0], #16 + subs x5, x5, #4 + b.gt 1b + ret +endfunc + +function ff_sbr_sum_square_neon, export=1 + movi v0.4S, #0 +1: ld1 {v1.4S}, [x0], #16 + fmla v0.4S, v1.4S, v1.4S + subs w1, w1, #2 + b.gt 1b + faddp v0.4S, v0.4S, v0.4S + faddp v0.4S, v0.4S, v0.4S + ret +endfunc + +function ff_sbr_neg_odd_64_neon, export=1 + mov x1, x0 + movi v5.4S, #1<<7, lsl #24 + ld2 {v0.4S, v1.4S}, [x0], #32 + eor v1.16B, v1.16B, v5.16B + ld2 {v2.4S, v3.4S}, [x0], #32 +.rept 3 + st2 {v0.4S, v1.4S}, [x1], #32 + eor v3.16B, v3.16B, v5.16B + ld2 {v0.4S, v1.4S}, [x0], #32 + st2 {v2.4S, v3.4S}, [x1], #32 + eor v1.16B, v1.16B, v5.16B + ld2 {v2.4S, v3.4S}, [x0], #32 +.endr + eor v3.16B, v3.16B, v5.16B + st2 {v0.4S, v1.4S}, [x1], #32 + st2 {v2.4S, v3.4S}, [x1], #32 + ret +endfunc + +function ff_sbr_qmf_pre_shuffle_neon, export=1 + add x1, x0, #60*4 + add x2, x0, #64*4 + mov x3, #-16 + mov x4, #-4 + movi v6.4S, #1<<7, lsl #24 + ld1 {v0.2S}, [x0], #8 + st1 {v0.2S}, [x2], #8 +.rept 7 + ld1 {v1.4S}, [x1], x3 + ld1 {v2.4S}, [x0], #16 + eor v1.16B, v1.16B, v6.16B + rev64 v1.4S, v1.4S + ext v1.16B, v1.16B, v1.16B, #8 + st2 {v1.4S, v2.4S}, [x2], #32 +.endr + add x1, x1, #8 + ld1 {v1.2S}, [x1], x4 + ld1 {v2.2S}, [x0], #8 + ld1 {v1.S}[3], [x1] + ld1 {v2.S}[2], [x0] + eor v1.16B, v1.16B, v6.16B + rev64 v1.4S, v1.4S + st2 {v1.2S, v2.2S}, [x2], #16 + st2 {v1.S, v2.S}[2], [x2] + ret +endfunc + +function ff_sbr_qmf_post_shuffle_neon, export=1 + add x2, x1, #60*4 + mov x3, #-16 + mov x4, #32 + movi v6.4S, #1<<7, lsl #24 +1: ld1 {v0.4S}, [x2], x3 + ld1 {v1.4S}, [x1], #16 + eor v0.16B, v0.16B, v6.16B + rev64 v0.4S, v0.4S + ext v0.16B, v0.16B, v0.16B, #8 + st2 {v0.4S, v1.4S}, [x0], #32 + subs x4, x4, #4 + b.gt 1b + ret +endfunc + +function ff_sbr_qmf_deint_neg_neon, export=1 + add x1, x1, #56*4 + add x2, x0, #60*4 + mov x3, #-32 + mov x4, #32 + movi v2.4S, #1<<7, lsl #24 +1: ld2 {v0.4S, v1.4S}, [x1], x3 + eor v0.16B, v0.16B, v2.16B + rev64 v1.4S, v1.4S + ext v1.16B, v1.16B, v1.16B, #8 + st1 {v0.4S}, [x2] + st1 {v1.4S}, [x0], #16 + sub x2, x2, #16 + subs x4, x4, #4 + b.gt 1b + ret +endfunc + +function ff_sbr_qmf_deint_bfly_neon, export=1 + add x2, x2, #60*4 + add x3, x0, #124*4 + mov x4, #64 + mov x5, #-16 +1: ld1 {v0.4S}, [x1], #16 + ld1 {v1.4S}, [x2], x5 + rev64 v2.4S, v0.4S + ext v2.16B, v2.16B, v2.16B, #8 + rev64 v3.4S, v1.4S + ext v3.16B, v3.16B, v3.16B, #8 + fadd v1.4S, v1.4S, v2.4S + fsub v0.4S, v0.4S, v3.4S + st1 {v0.4S}, [x0], #16 + st1 {v1.4S}, [x3], x5 + subs x4, x4, #4 + b.gt 1b + ret +endfunc + +function ff_sbr_hf_gen_neon, export=1 + sxtw x4, w4 + sxtw x5, w5 + movrel x6, factors + ld1 {v7.4S}, [x6] + dup v1.4S, v0.S[0] + mov v2.8B, v1.8B + mov v2.S[2], v7.S[0] + mov v2.S[3], v7.S[0] + fmul v1.4S, v1.4S, v2.4S + ld1 {v0.D}[0], [x3] + ld1 {v0.D}[1], [x2] + fmul v0.4S, v0.4S, v1.4S + fmul v1.4S, v0.4S, v7.4S + rev64 v0.4S, v0.4S + sub x7, x5, x4 + add x0, x0, x4, lsl #3 + add x1, x1, x4, lsl #3 + sub x1, x1, #16 +1: ld1 {v2.4S}, [x1], #16 + ld1 {v3.2S}, [x1] + fmul v4.4S, v2.4S, v1.4S + fmul v5.4S, v2.4S, v0.4S + faddp v4.4S, v4.4S, v4.4S + faddp v5.4S, v5.4S, v5.4S + faddp v4.4S, v4.4S, v4.4S + faddp v5.4S, v5.4S, v5.4S + mov v4.S[1], v5.S[0] + fadd v4.2S, v4.2S, v3.2S + st1 {v4.2S}, [x0], #8 + sub x1, x1, #8 + subs x7, x7, #1 + b.gt 1b + ret +endfunc + +function ff_sbr_hf_g_filt_neon, export=1 + sxtw x3, w3 + sxtw x4, w4 + mov x5, #40*2*4 + add x1, x1, x4, lsl #3 +1: ld1 {v0.2S}, [x1], x5 + ld1 {v1.S}[0], [x2], #4 + fmul v2.4S, v0.4S, v1.S[0] + st1 {v2.2S}, [x0], #8 + subs x3, x3, #1 + b.gt 1b + ret +endfunc + +function ff_sbr_autocorrelate_neon, export=1 + mov x2, #38 + movrel x3, factors + ld1 {v0.4S}, [x3] + movi v1.4S, #0 + movi v2.4S, #0 + movi v3.4S, #0 + ld1 {v4.2S}, [x0], #8 + ld1 {v5.2S}, [x0], #8 + fmul v16.2S, v4.2S, v4.2S + fmul v17.2S, v5.2S, v4.S[0] + fmul v18.2S, v5.2S, v4.S[1] +1: ld1 {v5.D}[1], [x0], #8 + fmla v1.2S, v4.2S, v4.2S + fmla v2.4S, v5.4S, v4.S[0] + fmla v3.4S, v5.4S, v4.S[1] + mov v4.D[0], v5.D[0] + mov v5.D[0], v5.D[1] + subs x2, x2, #1 + b.gt 1b + fmul v19.2S, v4.2S, v4.2S + fmul v20.2S, v5.2S, v4.S[0] + fmul v21.2S, v5.2S, v4.S[1] + fadd v22.4S, v2.4S, v20.4S + fsub v22.4S, v22.4S, v17.4S + fadd v23.4S, v3.4S, v21.4S + fsub v23.4S, v23.4S, v18.4S + rev64 v23.4S, v23.4S + fmul v23.4S, v23.4S, v0.4S + fadd v22.4S, v22.4S, v23.4S + st1 {v22.4S}, [x1], #16 + fadd v23.2S, v1.2S, v19.2S + fsub v23.2S, v23.2S, v16.2S + faddp v23.2S, v23.2S, v23.2S + st1 {v23.S}[0], [x1] + add x1, x1, #8 + rev64 v3.2S, v3.2S + fmul v3.2S, v3.2S, v0.2S + fadd v2.2S, v2.2S, v3.2S + st1 {v2.2S}, [x1] + add x1, x1, #16 + faddp v1.2S, v1.2S, v1.2S + st1 {v1.S}[0], [x1] + ret +endfunc + +.macro apply_noise_common + sxtw x3, w3 + sxtw x5, w5 + movrel x7, X(ff_sbr_noise_table) + add x3, x3, #1 +1: and x3, x3, #0x1ff + add x8, x7, x3, lsl #3 + add x3, x3, #2 + ld1 {v2.4S}, [x0] + ld1 {v3.2S}, [x1], #8 + ld1 {v4.2S}, [x2], #8 + ld1 {v5.4S}, [x8] + mov v6.16B, v2.16B + zip1 v3.4S, v3.4S, v3.4S + zip1 v4.4S, v4.4S, v4.4S + fmla v6.4S, v1.4S, v3.4S + fmla v2.4S, v5.4S, v4.4S + fcmeq v7.4S, v3.4S, #0 + bif v2.16B, v6.16B, v7.16B + st1 {v2.4S}, [x0], #16 + subs x5, x5, #2 + b.gt 1b +.endm + +function ff_sbr_hf_apply_noise_0_neon, export=1 + movrel x9, phi_noise_0 + ld1 {v1.4S}, [x9] + apply_noise_common + ret +endfunc + +function ff_sbr_hf_apply_noise_1_neon, export=1 + movrel x9, phi_noise_1 + and x4, x4, #1 + add x9, x9, x4, lsl #4 + ld1 {v1.4S}, [x9] + apply_noise_common + ret +endfunc + +function ff_sbr_hf_apply_noise_2_neon, export=1 + movrel x9, phi_noise_2 + ld1 {v1.4S}, [x9] + apply_noise_common + ret +endfunc + +function ff_sbr_hf_apply_noise_3_neon, export=1 + movrel x9, phi_noise_3 + and x4, x4, #1 + add x9, x9, x4, lsl #4 + ld1 {v1.4S}, [x9] + apply_noise_common + ret +endfunc diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S new file mode 100644 index 0000000000..5e4d021a97 --- /dev/null +++ b/libavcodec/aarch64/simple_idct_neon.S @@ -0,0 +1,362 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com> + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z4c ((1<<(COL_SHIFT-1))/Z4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define z1 v0.H[0] +#define z2 v0.H[1] +#define z3 v0.H[2] +#define z4 v0.H[3] +#define z5 v0.H[4] +#define z6 v0.H[5] +#define z7 v0.H[6] +#define z4c v0.H[7] + +const idct_coeff_neon, align=4 + .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c +endconst + +.macro idct_start data + prfm pldl1keep, [\data] + mov x10, x30 + movrel x3, idct_coeff_neon + ld1 {v0.2D}, [x3] +.endm + +.macro idct_end + br x10 +.endm + +.macro smull1 a, b, c + smull \a, \b, \c +.endm + +.macro smlal1 a, b, c + smlal \a, \b, \c +.endm + +.macro smlsl1 a, b, c + smlsl \a, \b, \c +.endm + +.macro idct_col4_top y1, y2, y3, y4, i, l + smull\i v7.4S, \y3\l, z2 + smull\i v16.4S, \y3\l, z6 + smull\i v17.4S, \y2\l, z1 + add v19.4S, v23.4S, v7.4S + smull\i v18.4S, \y2\l, z3 + add v20.4S, v23.4S, v16.4S + smull\i v5.4S, \y2\l, z5 + sub v21.4S, v23.4S, v16.4S + smull\i v6.4S, \y2\l, z7 + sub v22.4S, v23.4S, v7.4S + + smlal\i v17.4S, \y4\l, z3 + smlsl\i v18.4S, \y4\l, z7 + smlsl\i v5.4S, \y4\l, z1 + smlsl\i v6.4S, \y4\l, z5 +.endm + +.macro idct_row4_neon y1, y2, y3, y4, pass + ld1 {\y1\().2D,\y2\().2D}, [x2], #32 + movi v23.4S, #1<<2, lsl #8 + orr v5.16B, \y1\().16B, \y2\().16B + ld1 {\y3\().2D,\y4\().2D}, [x2], #32 + orr v6.16B, \y3\().16B, \y4\().16B + orr v5.16B, v5.16B, v6.16B + mov x3, v5.D[1] + smlal v23.4S, \y1\().4H, z4 + + idct_col4_top \y1, \y2, \y3, \y4, 1, .4H + + cmp x3, #0 + b.eq \pass\()f + + smull2 v7.4S, \y1\().8H, z4 + smlal2 v17.4S, \y2\().8H, z5 + smlsl2 v18.4S, \y2\().8H, z1 + smull2 v16.4S, \y3\().8H, z2 + smlal2 v5.4S, \y2\().8H, z7 + add v19.4S, v19.4S, v7.4S + sub v20.4S, v20.4S, v7.4S + sub v21.4S, v21.4S, v7.4S + add v22.4S, v22.4S, v7.4S + smlal2 v6.4S, \y2\().8H, z3 + smull2 v7.4S, \y3\().8H, z6 + smlal2 v17.4S, \y4\().8H, z7 + smlsl2 v18.4S, \y4\().8H, z5 + smlal2 v5.4S, \y4\().8H, z3 + smlsl2 v6.4S, \y4\().8H, z1 + add v19.4S, v19.4S, v7.4S + sub v20.4S, v20.4S, v16.4S + add v21.4S, v21.4S, v16.4S + sub v22.4S, v22.4S, v7.4S + +\pass: add \y3\().4S, v19.4S, v17.4S + add \y4\().4S, v20.4S, v18.4S + shrn \y1\().4H, \y3\().4S, #ROW_SHIFT + shrn \y2\().4H, \y4\().4S, #ROW_SHIFT + add v7.4S, v21.4S, v5.4S + add v16.4S, v22.4S, v6.4S + shrn \y3\().4H, v7.4S, #ROW_SHIFT + shrn \y4\().4H, v16.4S, #ROW_SHIFT + sub v22.4S, v22.4S, v6.4S + sub v19.4S, v19.4S, v17.4S + sub v21.4S, v21.4S, v5.4S + shrn2 \y1\().8H, v22.4S, #ROW_SHIFT + sub v20.4S, v20.4S, v18.4S + shrn2 \y2\().8H, v21.4S, #ROW_SHIFT + shrn2 \y3\().8H, v20.4S, #ROW_SHIFT + shrn2 \y4\().8H, v19.4S, #ROW_SHIFT + + trn1 v16.8H, \y1\().8H, \y2\().8H + trn2 v17.8H, \y1\().8H, \y2\().8H + trn1 v18.8H, \y3\().8H, \y4\().8H + trn2 v19.8H, \y3\().8H, \y4\().8H + trn1 \y1\().4S, v16.4S, v18.4S + trn1 \y2\().4S, v17.4S, v19.4S + trn2 \y3\().4S, v16.4S, v18.4S + trn2 \y4\().4S, v17.4S, v19.4S +.endm + +.macro declare_idct_col4_neon i, l +function idct_col4_neon\i + dup v23.4H, z4c +.if \i == 1 + add v23.4H, v23.4H, v24.4H +.else + mov v5.D[0], v24.D[1] + add v23.4H, v23.4H, v5.4H +.endif + smull v23.4S, v23.4H, z4 + + idct_col4_top v24, v25, v26, v27, \i, \l + + mov x4, v28.D[\i - 1] + mov x5, v29.D[\i - 1] + cmp x4, #0 + b.eq 1f + + smull\i v7.4S, v28\l, z4 + add v19.4S, v19.4S, v7.4S + sub v20.4S, v20.4S, v7.4S + sub v21.4S, v21.4S, v7.4S + add v22.4S, v22.4S, v7.4S + +1: mov x4, v30.D[\i - 1] + cmp x5, #0 + b.eq 2f + + smlal\i v17.4S, v29\l, z5 + smlsl\i v18.4S, v29\l, z1 + smlal\i v5.4S, v29\l, z7 + smlal\i v6.4S, v29\l, z3 + +2: mov x5, v31.D[\i - 1] + cmp x4, #0 + b.eq 3f + + smull\i v7.4S, v30\l, z6 + smull\i v16.4S, v30\l, z2 + add v19.4S, v19.4S, v7.4S + sub v22.4S, v22.4S, v7.4S + sub v20.4S, v20.4S, v16.4S + add v21.4S, v21.4S, v16.4S + +3: cmp x5, #0 + b.eq 4f + + smlal\i v17.4S, v31\l, z7 + smlsl\i v18.4S, v31\l, z5 + smlal\i v5.4S, v31\l, z3 + smlsl\i v6.4S, v31\l, z1 + +4: addhn v7.4H, v19.4S, v17.4S + addhn2 v7.8H, v20.4S, v18.4S + subhn v18.4H, v20.4S, v18.4S + subhn2 v18.8H, v19.4S, v17.4S + + addhn v16.4H, v21.4S, v5.4S + addhn2 v16.8H, v22.4S, v6.4S + subhn v17.4H, v22.4S, v6.4S + subhn2 v17.8H, v21.4S, v5.4S + + ret +endfunc +.endm + +declare_idct_col4_neon 1, .4H +declare_idct_col4_neon 2, .8H + +function ff_simple_idct_put_neon, export=1 + idct_start x2 + + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 + bl idct_col4_neon1 + + sqshrun v1.8B, v7.8H, #COL_SHIFT-16 + sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16 + sqshrun v3.8B, v17.8H, #COL_SHIFT-16 + sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16 + + bl idct_col4_neon2 + + sqshrun v2.8B, v7.8H, #COL_SHIFT-16 + sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16 + sqshrun v4.8B, v17.8H, #COL_SHIFT-16 + sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16 + + zip1 v16.4S, v1.4S, v2.4S + zip2 v17.4S, v1.4S, v2.4S + + st1 {v16.D}[0], [x0], x1 + st1 {v16.D}[1], [x0], x1 + + zip1 v18.4S, v3.4S, v4.4S + zip2 v19.4S, v3.4S, v4.4S + + st1 {v17.D}[0], [x0], x1 + st1 {v17.D}[1], [x0], x1 + st1 {v18.D}[0], [x0], x1 + st1 {v18.D}[1], [x0], x1 + st1 {v19.D}[0], [x0], x1 + st1 {v19.D}[1], [x0], x1 + + idct_end +endfunc + +function ff_simple_idct_add_neon, export=1 + idct_start x2 + + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 + bl idct_col4_neon1 + + sshr v1.8H, v7.8H, #COL_SHIFT-16 + sshr v2.8H, v16.8H, #COL_SHIFT-16 + sshr v3.8H, v17.8H, #COL_SHIFT-16 + sshr v4.8H, v18.8H, #COL_SHIFT-16 + + bl idct_col4_neon2 + + sshr v7.8H, v7.8H, #COL_SHIFT-16 + sshr v16.8H, v16.8H, #COL_SHIFT-16 + sshr v17.8H, v17.8H, #COL_SHIFT-16 + sshr v18.8H, v18.8H, #COL_SHIFT-16 + + mov x9, x0 + ld1 {v19.D}[0], [x0], x1 + zip1 v23.2D, v1.2D, v7.2D + zip2 v24.2D, v1.2D, v7.2D + ld1 {v19.D}[1], [x0], x1 + zip1 v25.2D, v2.2D, v16.2D + zip2 v26.2D, v2.2D, v16.2D + ld1 {v20.D}[0], [x0], x1 + zip1 v27.2D, v3.2D, v17.2D + zip2 v28.2D, v3.2D, v17.2D + ld1 {v20.D}[1], [x0], x1 + zip1 v29.2D, v4.2D, v18.2D + zip2 v30.2D, v4.2D, v18.2D + ld1 {v21.D}[0], [x0], x1 + uaddw v23.8H, v23.8H, v19.8B + uaddw2 v24.8H, v24.8H, v19.16B + ld1 {v21.D}[1], [x0], x1 + sqxtun v23.8B, v23.8H + sqxtun2 v23.16B, v24.8H + ld1 {v22.D}[0], [x0], x1 + uaddw v24.8H, v25.8H, v20.8B + uaddw2 v25.8H, v26.8H, v20.16B + ld1 {v22.D}[1], [x0], x1 + sqxtun v24.8B, v24.8H + sqxtun2 v24.16B, v25.8H + st1 {v23.D}[0], [x9], x1 + uaddw v25.8H, v27.8H, v21.8B + uaddw2 v26.8H, v28.8H, v21.16B + st1 {v23.D}[1], [x9], x1 + sqxtun v25.8B, v25.8H + sqxtun2 v25.16B, v26.8H + st1 {v24.D}[0], [x9], x1 + uaddw v26.8H, v29.8H, v22.8B + uaddw2 v27.8H, v30.8H, v22.16B + st1 {v24.D}[1], [x9], x1 + sqxtun v26.8B, v26.8H + sqxtun2 v26.16B, v27.8H + st1 {v25.D}[0], [x9], x1 + st1 {v25.D}[1], [x9], x1 + st1 {v26.D}[0], [x9], x1 + st1 {v26.D}[1], [x9], x1 + + idct_end +endfunc + +function ff_simple_idct_neon, export=1 + idct_start x0 + + mov x2, x0 + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 + sub x2, x2, #128 + bl idct_col4_neon1 + + sshr v1.8H, v7.8H, #COL_SHIFT-16 + sshr v2.8H, v16.8H, #COL_SHIFT-16 + sshr v3.8H, v17.8H, #COL_SHIFT-16 + sshr v4.8H, v18.8H, #COL_SHIFT-16 + + bl idct_col4_neon2 + + sshr v7.8H, v7.8H, #COL_SHIFT-16 + sshr v16.8H, v16.8H, #COL_SHIFT-16 + sshr v17.8H, v17.8H, #COL_SHIFT-16 + sshr v18.8H, v18.8H, #COL_SHIFT-16 + + zip1 v23.2D, v1.2D, v7.2D + zip2 v24.2D, v1.2D, v7.2D + st1 {v23.2D,v24.2D}, [x2], #32 + zip1 v25.2D, v2.2D, v16.2D + zip2 v26.2D, v2.2D, v16.2D + st1 {v25.2D,v26.2D}, [x2], #32 + zip1 v27.2D, v3.2D, v17.2D + zip2 v28.2D, v3.2D, v17.2D + st1 {v27.2D,v28.2D}, [x2], #32 + zip1 v29.2D, v4.2D, v18.2D + zip2 v30.2D, v4.2D, v18.2D + st1 {v29.2D,v30.2D}, [x2], #32 + + idct_end +endfunc diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/synth_filter_init.c index d3430d045c..767b01112a 100644 --- a/libavcodec/aarch64/dcadsp_init.c +++ b/libavcodec/aarch64/synth_filter_init.c @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,8 +23,8 @@ #include "libavutil/aarch64/cpu.h" #include "libavutil/attributes.h" #include "libavutil/internal.h" -#include "libavcodec/dcadsp.h" #include "libavcodec/fft.h" +#include "libavcodec/synth_filter.h" #include "asm-offsets.h" @@ -32,25 +32,12 @@ AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); #endif -void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); -void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); - void ff_synth_filter_float_neon(FFTContext *imdct, float *synth_buf_ptr, int *synth_buf_offset, float synth_buf2[32], const float window[512], float out[32], const float in[32], float scale); -av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->lfe_fir[0] = ff_dca_lfe_fir0_neon; - s->lfe_fir[1] = ff_dca_lfe_fir1_neon; - } -} - av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s) { int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S index b001c737da..8fcd71f252 100644 --- a/libavcodec/aarch64/synth_filter_neon.S +++ b/libavcodec/aarch64/synth_filter_neon.S @@ -2,20 +2,20 @@ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c index ab97a97740..13dfd74940 100644 --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/videodsp.S b/libavcodec/aarch64/videodsp.S index 7ce5a7ddf6..24067cc2af 100644 --- a/libavcodec/aarch64/videodsp.S +++ b/libavcodec/aarch64/videodsp.S @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/videodsp_init.c b/libavcodec/aarch64/videodsp_init.c index 59b697d4f4..6f667a6d3e 100644 --- a/libavcodec/aarch64/videodsp_init.c +++ b/libavcodec/aarch64/videodsp_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/vorbisdsp_init.c b/libavcodec/aarch64/vorbisdsp_init.c index 3559b54a30..c796f95e61 100644 --- a/libavcodec/aarch64/vorbisdsp_init.c +++ b/libavcodec/aarch64/vorbisdsp_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/vorbisdsp_neon.S b/libavcodec/aarch64/vorbisdsp_neon.S index 11f71f1d89..e76feebc54 100644 --- a/libavcodec/aarch64/vorbisdsp_neon.S +++ b/libavcodec/aarch64/vorbisdsp_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h new file mode 100644 index 0000000000..ea7665dcc8 --- /dev/null +++ b/libavcodec/aarch64/vp8dsp.h @@ -0,0 +1,70 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AARCH64_VP8DSP_H +#define AVCODEC_AARCH64_VP8DSP_H + +#include "libavcodec/vp8dsp.h" + +#define VP8_LF_Y(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_UV(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_SIMPLE(hv, opt) \ + void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim) + +#define VP8_LF_HV(inner, opt) \ + VP8_LF_Y(h, inner, opt); \ + VP8_LF_Y(v, inner, opt); \ + VP8_LF_UV(h, inner, opt); \ + VP8_LF_UV(v, inner, opt) + +#define VP8_LF(opt) \ + VP8_LF_HV(, opt); \ + VP8_LF_HV(_inner, opt); \ + VP8_LF_SIMPLE(h, opt); \ + VP8_LF_SIMPLE(v, opt) + +#define VP8_MC(n, opt) \ + void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int h, int x, int y) + +#define VP8_EPEL(w, opt) \ + VP8_MC(pixels ## w, opt); \ + VP8_MC(epel ## w ## _h4, opt); \ + VP8_MC(epel ## w ## _h6, opt); \ + VP8_MC(epel ## w ## _v4, opt); \ + VP8_MC(epel ## w ## _h4v4, opt); \ + VP8_MC(epel ## w ## _h6v4, opt); \ + VP8_MC(epel ## w ## _v6, opt); \ + VP8_MC(epel ## w ## _h4v6, opt); \ + VP8_MC(epel ## w ## _h6v6, opt) + +#endif /* AVCODEC_AARCH64_VP8DSP_H */ diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c new file mode 100644 index 0000000000..dbc07408a0 --- /dev/null +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -0,0 +1,77 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/vp8dsp.h" +#include "vp8dsp.h" + +void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]); + +void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); + +VP8_LF(neon); + +VP8_EPEL(16, neon); +VP8_EPEL(8, neon); + + +av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) +{ + if (!have_neon(av_get_cpu_flags())) { + return; + } + dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; + dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon; + dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon; + dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon; + + dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; + dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon; +} + +av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) +{ + if (!have_neon(av_get_cpu_flags())) { + return; + } + + dsp->vp8_idct_add = ff_vp8_idct_add_neon; + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; + + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; + dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; + dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon; + dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon; + + dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon; + dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon; + dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon; + dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon; + + dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon; + dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon; +} diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S new file mode 100644 index 0000000000..0ce9e301de --- /dev/null +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -0,0 +1,1031 @@ +/* + * VP8 NEON optimisations + * + * Copyright (c) 2010 Rob Clark <rob@ti.com> + * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +function ff_vp8_idct_add_neon, export=1 + ld1 {v0.8b - v3.8b}, [x1] + mov w4, #20091 + movk w4, #35468/2, lsl #16 + dup v4.2s, w4 + + smull v26.4s, v1.4h, v4.h[0] + smull v27.4s, v3.4h, v4.h[0] + sqdmulh v20.4h, v1.4h, v4.h[1] + sqdmulh v23.4h, v3.4h, v4.h[1] + sqshrn v21.4h, v26.4s, #16 + sqshrn v22.4h, v27.4s, #16 + add v21.4h, v21.4h, v1.4h + add v22.4h, v22.4h, v3.4h + + add v16.4h, v0.4h, v2.4h + sub v17.4h, v0.4h, v2.4h + + add v18.4h, v21.4h, v23.4h + sub v19.4h, v20.4h, v22.4h + + add v0.4h, v16.4h, v18.4h + add v1.4h, v17.4h, v19.4h + sub v3.4h, v16.4h, v18.4h + sub v2.4h, v17.4h, v19.4h + + transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7 + + movi v29.8h, #0 + smull v26.4s, v1.4h, v4.h[0] + st1 {v29.8h}, [x1], #16 + smull v27.4s, v3.4h, v4.h[0] + st1 {v29.16b}, [x1] + sqdmulh v21.4h, v1.4h, v4.h[1] + sqdmulh v23.4h, v3.4h, v4.h[1] + sqshrn v20.4h, v26.4s, #16 + sqshrn v22.4h, v27.4s, #16 + add v20.4h, v20.4h, v1.4h + add v22.4h, v22.4h, v3.4h + add v16.4h, v0.4h, v2.4h + sub v17.4h, v0.4h, v2.4h + + add v18.4h, v20.4h, v23.4h + ld1 {v24.d}[0], [x0], x2 + zip1 v16.2d, v16.2d, v17.2d + sub v19.4h, v21.4h, v22.4h + ld1 {v25.d}[0], [x0], x2 + zip1 v18.2d, v18.2d, v19.2d + add v0.8h, v16.8h, v18.8h + ld1 {v25.d}[1], [x0], x2 + sub v1.8h, v16.8h, v18.8h + ld1 {v24.d}[1], [x0], x2 + srshr v0.8h, v0.8h, #3 + trn1 v24.4s, v24.4s, v25.4s + srshr v1.8h, v1.8h, #3 + sub x0, x0, x2, lsl #2 + + ext v1.16b, v1.16b, v1.16b, #8 + trn1 v3.2d, v0.2d, v1.2d + trn2 v0.2d, v0.2d, v1.2d + trn1 v1.8h, v3.8h, v0.8h + trn2 v3.8h, v3.8h, v0.8h + uzp1 v0.4s, v1.4s, v3.4s + uzp2 v1.4s, v3.4s, v1.4s + + uaddw v0.8h, v0.8h, v24.8b + uaddw2 v1.8h, v1.8h, v24.16b + sqxtun v0.8b, v0.8h + sqxtun2 v0.16b, v1.8h + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v0.s}[3], [x0], x2 + st1 {v0.s}[2], [x0], x2 + + ret +endfunc + +function ff_vp8_idct_dc_add4y_neon, export=1 + movi v0.16b, #0 + mov x3, #32 + ld1r {v16.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v17.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + zip1 v16.2d, v16.2d, v17.2d + ld1r {v18.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v19.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + zip1 v18.2d, v18.2d, v19.2d + srshr v16.8h, v16.8h, #3 // dc >>= 3 + ld1 {v0.16b}, [x0], x2 + srshr v18.8h, v18.8h, #3 + ld1 {v1.16b}, [x0], x2 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v2.16b}, [x0], x2 + uaddw2 v0.8h, v18.8h, v0.16b + ld1 {v3.16b}, [x0], x2 + uaddw v21.8h, v16.8h, v1.8b + uaddw2 v1.8h, v18.8h, v1.16b + uaddw v22.8h, v16.8h, v2.8b + uaddw2 v2.8h, v18.8h, v2.16b + uaddw v23.8h, v16.8h, v3.8b + uaddw2 v3.8h, v18.8h, v3.16b + sub x0, x0, x2, lsl #2 + sqxtun v20.8b, v20.8h + sqxtun2 v20.16b, v0.8h + sqxtun v21.8b, v21.8h + sqxtun2 v21.16b, v1.8h + sqxtun v22.8b, v22.8h + st1 {v20.16b}, [x0], x2 + sqxtun2 v22.16b, v2.8h + st1 {v21.16b}, [x0], x2 + sqxtun v23.8b, v23.8h + st1 {v22.16b}, [x0], x2 + sqxtun2 v23.16b, v3.8h + st1 {v23.16b}, [x0], x2 + + ret +endfunc + +function ff_vp8_idct_dc_add_neon, export=1 + mov w3, #0 + ld1r {v2.8h}, [x1] + strh w3, [x1] + srshr v2.8h, v2.8h, #3 + ld1 {v0.s}[0], [x0], x2 + ld1 {v0.s}[1], [x0], x2 + uaddw v3.8h, v2.8h, v0.8b + ld1 {v1.s}[0], [x0], x2 + ld1 {v1.s}[1], [x0], x2 + uaddw v4.8h, v2.8h, v1.8b + sqxtun v0.8b, v3.8h + sqxtun v1.8b, v4.8h + sub x0, x0, x2, lsl #2 + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v1.s}[1], [x0], x2 + ret +endfunc + +// Register layout: +// P3..Q3 -> v0..v7 +// flim_E -> v22 +// flim_I -> v23 +// hev_thresh -> x5 +// +.macro vp8_loop_filter, inner=0, simple=0, hev_thresh + .if \simple + uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0) + uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1) + uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2 + ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2 + uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + movi v21.16b, #0x80 + cmhs v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim + .else + // calculate hev and normal_limit: + uabd v20.16b, v2.16b, v3.16b // abs(P1-P0) + uabd v21.16b, v5.16b, v4.16b // abs(Q1-Q0) + uabd v18.16b, v0.16b, v1.16b // abs(P3-P2) + uabd v19.16b, v1.16b, v2.16b // abs(P2-P1) + cmhs v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I + cmhs v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I + cmhs v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I + cmhs v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I + and v16.16b, v17.16b, v16.16b + uabd v17.16b, v7.16b, v6.16b // abs(Q3-Q2) + and v16.16b, v16.16b, v19.16b + uabd v19.16b, v6.16b, v5.16b // abs(Q2-Q1) + and v16.16b, v16.16b, v18.16b + cmhs v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I + cmhs v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I + uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0) + uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1) + and v16.16b, v16.16b, v18.16b + uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2 + and v16.16b, v16.16b, v19.16b + ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2 + dup v23.16b, \hev_thresh // hev_thresh + uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + cmhi v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh + cmhs v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E + cmhi v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh + and v16.16b, v16.16b, v19.16b + movi v21.16b, #0x80 + orr v17.16b, v20.16b, v22.16b + .endif + + // at this point: + // v16: normal_limit + // v17: hev + + // convert to signed value: + eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80 + eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80 + + movi v20.8h, #3 + ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0 + ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit) + eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80 + mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0) + mul v19.8h, v19.8h, v20.8h + + sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1) + movi v22.16b, #4 + movi v23.16b, #3 + .if \inner + and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1) + .endif + saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1) + saddw2 v19.8h, v19.8h, v20.16b + sqxtn v18.8b, v18.8h // narrow result back into v18 + sqxtn2 v18.16b, v19.8h + .if !\inner && !\simple + eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80 + eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80 + .endif + and v18.16b, v18.16b, v16.16b // w &= normal_limit + + // registers used at this point.. + // v0 -> P3 (don't corrupt) + // v1-v6 -> PS2-QS2 + // v7 -> Q3 (don't corrupt) + // v17 -> hev + // v18 -> w + // v21 -> #0x80 + // v22 -> #4 + // v23 -> #3 + // v16, v19, v29 -> unused + // + // filter_common: is4tap==1 + // c1 = clamp(w + 4) >> 3; + // c2 = clamp(w + 3) >> 3; + // Q0 = s2u(QS0 - c1); + // P0 = s2u(PS0 + c2); + + .if \simple + sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4) + sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + .elseif \inner + // the !is4tap case of filter_common, only used for inner blocks + // c3 = ((c1&~hev) + 1) >> 1; + // Q1 = s2u(QS1 - c3); + // P1 = s2u(PS1 + c3); + sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4) + sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + bic v19.16b, v19.16b, v17.16b // c1 & ~hev + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + srshr v19.16b, v19.16b, #1 // c3 >>= 1 + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3) + sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3) + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + .else + and v20.16b, v18.16b, v17.16b // w & hev + sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4) + sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + bic v18.16b, v18.16b, v17.16b // w &= ~hev + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + + // filter_mbedge: + // a = clamp((27*w + 63) >> 7); + // Q0 = s2u(QS0 - a); + // P0 = s2u(PS0 + a); + // a = clamp((18*w + 63) >> 7); + // Q1 = s2u(QS1 - a); + // P1 = s2u(PS1 + a); + // a = clamp((9*w + 63) >> 7); + // Q2 = s2u(QS2 - a); + // P2 = s2u(PS2 + a); + movi v17.8h, #63 + sshll v22.8h, v18.8b, #3 + sshll2 v23.8h, v18.16b, #3 + saddw v22.8h, v22.8h, v18.8b + saddw2 v23.8h, v23.8h, v18.16b + add v16.8h, v17.8h, v22.8h + add v17.8h, v17.8h, v23.8h // 9*w + 63 + add v19.8h, v16.8h, v22.8h + add v20.8h, v17.8h, v23.8h // 18*w + 63 + add v22.8h, v19.8h, v22.8h + add v23.8h, v20.8h, v23.8h // 27*w + 63 + sqshrn v16.8b, v16.8h, #7 + sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7) + sqshrn v19.8b, v19.8h, #7 + sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7) + sqshrn v22.8b, v22.8h, #7 + sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7) + sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a) + sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a) + sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a) + sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a) + sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a) + sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a) + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80 + eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80 + .endif +.endm + +.macro vp8_v_loop_filter16 name, inner=0, simple=0 +function ff_vp8_v_loop_filter16\name\()_neon, export=1 + sub x0, x0, x1, lsl #1+!\simple + + // Load pixels: + .if !\simple + ld1 {v0.16b}, [x0], x1 // P3 + ld1 {v1.16b}, [x0], x1 // P2 + .endif + ld1 {v2.16b}, [x0], x1 // P1 + ld1 {v3.16b}, [x0], x1 // P0 + ld1 {v4.16b}, [x0], x1 // Q0 + ld1 {v5.16b}, [x0], x1 // Q1 + .if !\simple + ld1 {v6.16b}, [x0], x1 // Q2 + ld1 {v7.16b}, [x0] // Q3 + dup v23.16b, w3 // flim_I + .endif + dup v22.16b, w2 // flim_E + + vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4 + + // back up to P2: dst -= stride * 6 + sub x0, x0, x1, lsl #2 + .if !\simple + sub x0, x0, x1, lsl #1 + + // Store pixels: + st1 {v1.16b}, [x0], x1 // P2 + .endif + st1 {v2.16b}, [x0], x1 // P1 + st1 {v3.16b}, [x0], x1 // P0 + st1 {v4.16b}, [x0], x1 // Q0 + st1 {v5.16b}, [x0], x1 // Q1 + .if !\simple + st1 {v6.16b}, [x0] // Q2 + .endif + + ret +endfunc +.endm + +vp8_v_loop_filter16 +vp8_v_loop_filter16 _inner, inner=1 +vp8_v_loop_filter16 _simple, simple=1 + +.macro vp8_v_loop_filter8uv name, inner=0 +function ff_vp8_v_loop_filter8uv\name\()_neon, export=1 + sub x0, x0, x2, lsl #2 + sub x1, x1, x2, lsl #2 + // Load pixels: + ld1 {v0.d}[0], [x0], x2 // P3 + ld1 {v0.d}[1], [x1], x2 // P3 + ld1 {v1.d}[0], [x0], x2 // P2 + ld1 {v1.d}[1], [x1], x2 // P2 + ld1 {v2.d}[0], [x0], x2 // P1 + ld1 {v2.d}[1], [x1], x2 // P1 + ld1 {v3.d}[0], [x0], x2 // P0 + ld1 {v3.d}[1], [x1], x2 // P0 + ld1 {v4.d}[0], [x0], x2 // Q0 + ld1 {v4.d}[1], [x1], x2 // Q0 + ld1 {v5.d}[0], [x0], x2 // Q1 + ld1 {v5.d}[1], [x1], x2 // Q1 + ld1 {v6.d}[0], [x0], x2 // Q2 + ld1 {v6.d}[1], [x1], x2 // Q2 + ld1 {v7.d}[0], [x0] // Q3 + ld1 {v7.d}[1], [x1] // Q3 + + dup v22.16b, w3 // flim_E + dup v23.16b, w4 // flim_I + + vp8_loop_filter inner=\inner, hev_thresh=w5 + + // back up to P2: u,v -= stride * 6 + sub x0, x0, x2, lsl #2 + sub x1, x1, x2, lsl #2 + sub x0, x0, x2, lsl #1 + sub x1, x1, x2, lsl #1 + + // Store pixels: + + st1 {v1.d}[0], [x0], x2 // P2 + st1 {v1.d}[1], [x1], x2 // P2 + st1 {v2.d}[0], [x0], x2 // P1 + st1 {v2.d}[1], [x1], x2 // P1 + st1 {v3.d}[0], [x0], x2 // P0 + st1 {v3.d}[1], [x1], x2 // P0 + st1 {v4.d}[0], [x0], x2 // Q0 + st1 {v4.d}[1], [x1], x2 // Q0 + st1 {v5.d}[0], [x0], x2 // Q1 + st1 {v5.d}[1], [x1], x2 // Q1 + st1 {v6.d}[0], [x0] // Q2 + st1 {v6.d}[1], [x1] // Q2 + + ret +endfunc +.endm + +vp8_v_loop_filter8uv +vp8_v_loop_filter8uv _inner, inner=1 + +.macro vp8_h_loop_filter16 name, inner=0, simple=0 +function ff_vp8_h_loop_filter16\name\()_neon, export=1 + + sub x0, x0, #4 + // Load pixels: + ld1 {v0.d}[0], [x0], x1 + ld1 {v1.d}[0], [x0], x1 + ld1 {v2.d}[0], [x0], x1 + ld1 {v3.d}[0], [x0], x1 + ld1 {v4.d}[0], [x0], x1 + ld1 {v5.d}[0], [x0], x1 + ld1 {v6.d}[0], [x0], x1 + ld1 {v7.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + dup v22.16b, w2 // flim_E + .if !\simple + dup v23.16b, w3 // flim_I + .endif + + vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4 + + sub x0, x0, x1, lsl #4 // backup 16 rows + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + // Store pixels: + st1 {v0.d}[0], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v3.d}[0], [x0], x1 + st1 {v4.d}[0], [x0], x1 + st1 {v5.d}[0], [x0], x1 + st1 {v6.d}[0], [x0], x1 + st1 {v7.d}[0], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v7.d}[1], [x0] + + ret +endfunc +.endm + +vp8_h_loop_filter16 +vp8_h_loop_filter16 _inner, inner=1 +vp8_h_loop_filter16 _simple, simple=1 + +.macro vp8_h_loop_filter8uv name, inner=0 +function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 + sub x0, x0, #4 + sub x1, x1, #4 + + // Load pixels: + ld1 {v0.d}[0], [x0], x2 // load u + ld1 {v0.d}[1], [x1], x2 // load v + ld1 {v1.d}[0], [x0], x2 + ld1 {v1.d}[1], [x1], x2 + ld1 {v2.d}[0], [x0], x2 + ld1 {v2.d}[1], [x1], x2 + ld1 {v3.d}[0], [x0], x2 + ld1 {v3.d}[1], [x1], x2 + ld1 {v4.d}[0], [x0], x2 + ld1 {v4.d}[1], [x1], x2 + ld1 {v5.d}[0], [x0], x2 + ld1 {v5.d}[1], [x1], x2 + ld1 {v6.d}[0], [x0], x2 + ld1 {v6.d}[1], [x1], x2 + ld1 {v7.d}[0], [x0], x2 + ld1 {v7.d}[1], [x1], x2 + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + dup v22.16b, w3 // flim_E + dup v23.16b, w4 // flim_I + + vp8_loop_filter inner=\inner, hev_thresh=w5 + + sub x0, x0, x2, lsl #3 // backup u 8 rows + sub x1, x1, x2, lsl #3 // backup v 8 rows + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + // Store pixels: + st1 {v0.d}[0], [x0], x2 // load u + st1 {v0.d}[1], [x1], x2 // load v + st1 {v1.d}[0], [x0], x2 + st1 {v1.d}[1], [x1], x2 + st1 {v2.d}[0], [x0], x2 + st1 {v2.d}[1], [x1], x2 + st1 {v3.d}[0], [x0], x2 + st1 {v3.d}[1], [x1], x2 + st1 {v4.d}[0], [x0], x2 + st1 {v4.d}[1], [x1], x2 + st1 {v5.d}[0], [x0], x2 + st1 {v5.d}[1], [x1], x2 + st1 {v6.d}[0], [x0], x2 + st1 {v6.d}[1], [x1], x2 + st1 {v7.d}[0], [x0] + st1 {v7.d}[1], [x1] + + ret + +endfunc +.endm + +vp8_h_loop_filter8uv +vp8_h_loop_filter8uv _inner, inner=1 + + +function ff_put_vp8_pixels16_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x2], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x2], x3 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + bgt 1b + ret +endfunc + +function ff_put_vp8_pixels8_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v0.d}[1], [x2], x3 + ld1 {v1.8b}, [x2], x3 + ld1 {v1.d}[1], [x2], x3 + st1 {v0.8b}, [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v1.d}[1], [x0], x1 + bgt 1b + ret +endfunc + +/* 4/6-tap 8th-pel MC */ + +.macro vp8_epel8_h6 d, s0, s1 + ext v22.8b, \s0\().8b, \s1\().8b, #1 + uxtl v18.8h, \s0\().8b + ext v23.8b, \s0\().8b, \s1\().8b, #2 + uxtl v19.8h, v22.8b + ext v24.8b, \s0\().8b, \s1\().8b, #3 + uxtl v21.8h, v23.8b + ext v25.8b, \s0\().8b, \s1\().8b, #4 + uxtl v22.8h, v24.8b + ext v26.8b, \s0\().8b, \s1\().8b, #5 + uxtl v25.8h, v25.8b + mul v21.8h, v21.8h, v0.h[2] + uxtl v26.8h, v26.8b + mul v22.8h, v22.8h, v0.h[3] + mls v21.8h, v19.8h, v0.h[1] + mls v22.8h, v25.8h, v0.h[4] + mla v21.8h, v18.8h, v0.h[0] + mla v22.8h, v26.8h, v0.h[5] + sqadd v22.8h, v21.8h, v22.8h + sqrshrun \d\().8b, v22.8h, #7 +.endm + +.macro vp8_epel16_h6 d0, v0, v1 + ext v22.16b, \v0\().16b, \v1\().16b, #3 + ext v23.16b, \v0\().16b, \v1\().16b, #4 + uxtl v19.8h, v22.8b + uxtl2 v22.8h, v22.16b + ext v3.16b, \v0\().16b, \v1\().16b, #2 + uxtl v20.8h, v23.8b + uxtl2 v23.8h, v23.16b + ext v16.16b, \v0\().16b, \v1\().16b, #1 + uxtl v18.8h, v3.8b + uxtl2 v3.8h, v3.16b + ext v2.16b, \v0\().16b, \v1\().16b, #5 + uxtl v21.8h, v2.8b + uxtl2 v2.8h, v2.16b + uxtl v17.8h, v16.8b + uxtl2 v16.8h, v16.16b + mul v19.8h, v19.8h, v0.h[3] + mul v18.8h, v18.8h, v0.h[2] + mul v3.8h, v3.8h, v0.h[2] + mul v22.8h, v22.8h, v0.h[3] + mls v19.8h, v20.8h, v0.h[4] + uxtl v20.8h, \v0\().8b + uxtl2 v1.8h, \v0\().16b + mls v18.8h, v17.8h, v0.h[1] + mls v3.8h, v16.8h, v0.h[1] + mls v22.8h, v23.8h, v0.h[4] + mla v18.8h, v20.8h, v0.h[0] + mla v19.8h, v21.8h, v0.h[5] + mla v3.8h, v1.8h, v0.h[0] + mla v22.8h, v2.8h, v0.h[5] + sqadd v19.8h, v18.8h, v19.8h + sqadd v22.8h, v3.8h, v22.8h + sqrshrun \d0\().8b, v19.8h, #7 + sqrshrun2 \d0\().16b, v22.8h, #7 +.endm + +.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5 + uxtl \s2\().8h, \s2\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s4\().8h, \s4\().8b + uxtl \s0\().8h, \s0\().8b + uxtl \s5\().8h, \s5\().8b + mul \s2\().8h, \s2\().8h, v0.h[2] + mul \s3\().8h, \s3\().8h, v0.h[3] + mls \s2\().8h, \s1\().8h, v0.h[1] + mls \s3\().8h, \s4\().8h, v0.h[4] + mla \s2\().8h, \s0\().8h, v0.h[0] + mla \s3\().8h, \s5\().8h, v0.h[5] + sqadd \s3\().8h, \s2\().8h, \s3\().8h + sqrshrun \d0\().8b, \s3\().8h, #7 +.endm + +.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 + uxtl \s0\().8h, \s0\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s6\().8h, \s6\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s4\().8h, \s4\().8b + uxtl \s2\().8h, \s2\().8b + uxtl \s5\().8h, \s5\().8b + mul \s0\().8h, \s0\().8h, v0.h[0] + mul v31.8h , \s3\().8h, v0.h[3] + mul \s3\().8h, \s3\().8h, v0.h[2] + mul \s6\().8h, \s6\().8h, v0.h[5] + + mls \s0\().8h, \s1\().8h, v0.h[1] + mls v31.8h , \s4\().8h, v0.h[4] + mls \s3\().8h, \s2\().8h, v0.h[1] + mls \s6\().8h, \s5\().8h, v0.h[4] + + mla \s0\().8h, \s2\().8h, v0.h[2] + mla v31.8h , \s5\().8h, v0.h[5] + mla \s3\().8h, \s1\().8h, v0.h[0] + mla \s6\().8h, \s4\().8h, v0.h[3] + sqadd v31.8h , \s0\().8h, v31.8h + sqadd \s6\().8h, \s3\().8h, \s6\().8h + sqrshrun \d0\().8b, v31.8h, #7 + sqrshrun \d1\().8b, \s6\().8h, #7 +.endm + +.macro vp8_epel8_h4 d, v0, v1 + ext v22.8b, \v0\().8b, \v1\().8b, #1 + uxtl v19.8h, \v0\().8b + ext v23.8b, \v0\().8b, \v1\().8b, #2 + uxtl v20.8h, v22.8b + ext v25.8b, \v0\().8b, \v1\().8b, #3 + uxtl v22.8h, v23.8b + uxtl v25.8h, v25.8b + mul v20.8h, v20.8h, v0.h[2] + mul v22.8h, v22.8h, v0.h[3] + mls v20.8h, v19.8h, v0.h[1] + mls v22.8h, v25.8h, v0.h[4] + sqadd v22.8h, v20.8h, v22.8h + sqrshrun \d\().8b, v22.8h, #7 +.endm + +.macro vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4 + uxtl \s0\().8h, \s0\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s2\().8h, \s2\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s4\().8h, \s4\().8b + mul v21.8h, \s1\().8h, v0.h[2] + mul v23.8h, \s2\().8h, v0.h[3] + mul \s2\().8h, \s2\().8h, v0.h[2] + mul v22.8h, \s3\().8h, v0.h[3] + mls v21.8h, \s0\().8h, v0.h[1] + mls v23.8h, \s3\().8h, v0.h[4] + mls \s2\().8h, \s1\().8h, v0.h[1] + mls v22.8h, \s4\().8h, v0.h[4] + sqadd v21.8h, v21.8h, v23.8h + sqadd \s2\().8h, \s2\().8h, v22.8h + sqrshrun \d0\().8b, v21.8h, #7 + sqrshrun2 \d0\().16b, \s2\().8h, #7 +.endm + + +// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit +// arithmatic can be used to apply filters +const subpel_filters, align=4 + .short 0, 6, 123, 12, 1, 0, 0, 0 + .short 2, 11, 108, 36, 8, 1, 0, 0 + .short 0, 9, 93, 50, 6, 0, 0, 0 + .short 3, 16, 77, 77, 16, 3, 0, 0 + .short 0, 6, 50, 93, 9, 0, 0, 0 + .short 1, 8, 36, 108, 11, 2, 0, 0 + .short 0, 1, 12, 123, 6, 0, 0, 0 +endconst + +function ff_put_vp8_epel16_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + sxtw x4, w4 + sxtw x6, w6 + movrel x17, subpel_filters, -16 + add x6, x17, x6, lsl #4 // y + ld1 {v0.8h}, [x6] +1: + ld1 {v1.1d - v2.1d}, [x2], x3 + ld1 {v3.1d - v4.1d}, [x2], x3 + ld1 {v16.1d - v17.1d}, [x2], x3 + ld1 {v18.1d - v19.1d}, [x2], x3 + ld1 {v20.1d - v21.1d}, [x2], x3 + ld1 {v22.1d - v23.1d}, [x2], x3 + ld1 {v24.1d - v25.1d}, [x2] + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 + vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 + + st1 {v1.1d - v2.1d}, [x0], x1 + st1 {v3.1d - v4.1d}, [x0], x1 + subs x4, x4, #2 + bne 1b + + ret +endfunc + +function ff_put_vp8_epel16_h6_neon, export=1 + sub x2, x2, #2 + sxtw x5, w5 // x + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + add x5, x17, x5, lsl #4 // x + ld1 {v0.8h}, [x5] +1: + ld1 {v1.16b, v2.16b}, [x2], x3 + vp8_epel16_h6 v1, v1, v2 + st1 {v1.16b}, [x0], x1 + + subs w4, w4, #1 + bne 1b + ret +endfunc + + +function ff_put_vp8_epel16_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 // x + add x16, x17, x5, lsl #4 // x + sub sp, sp, #336+16 + ld1 {v0.8h}, [x16] + add x7, sp, #15 + sxtw x4, w4 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.16b, v2.16b}, [x2], x3 + vp8_epel16_h6 v1, v1, v2 + st1 {v1.16b}, [x7], #16 + subs x16, x16, #1 + bne 1b + + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v16.8b - v19.8b}, [x7], #32 + ld1 {v20.8b - v23.8b}, [x7] + sub x7, x7, #48 + + vp8_epel8_v6 v5, v1, v3, v16, v18, v20, v22 + vp8_epel8_v6 v2, v2, v4, v17, v19, v21, v23 + trn1 v2.2d, v5.2d, v2.2d + + st1 {v2.16b}, [x0], x1 + subs x4, x4, #1 + bne 2b + + add sp, sp, #336+16 + ret +endfunc + +function ff_put_vp8_epel8_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + sxtw x4, w4 + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h6 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v5.8b - v7.8b}, [x7] + + sub x7, x7, #16 + + vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7 + + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc + +function ff_put_vp8_epel8_h4v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #1 + sxtw x4, w4 + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h4 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v5.8b - v7.8b}, [x7] + + sub x7, x7, #16 + + vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7 + + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc + +function ff_put_vp8_epel8_h4v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #1 + sxtw x4, w4 + + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #3 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h4 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v2.8b}, [x7], #16 + ld1 {v3.8b - v5.8b}, [x7] + + vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5 + + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc + +function ff_put_vp8_epel8_h6v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #2 + sxtw x4, w4 + + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #3 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h6 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v2.8b}, [x7], #16 + ld1 {v3.8b - v5.8b}, [x7] + + vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5 + + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc diff --git a/libavcodec/aarch64/vp9dsp_init.h b/libavcodec/aarch64/vp9dsp_init.h new file mode 100644 index 0000000000..9df1752c62 --- /dev/null +++ b/libavcodec/aarch64/vp9dsp_init.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H +#define AVCODEC_AARCH64_VP9DSP_INIT_H + +#include "libavcodec/vp9dsp.h" + +void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp); +void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp); + +#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */ diff --git a/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c new file mode 100644 index 0000000000..0fa0d7f8c2 --- /dev/null +++ b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BPP 10 +#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64 +#include "vp9dsp_init_16bpp_aarch64_template.c" diff --git a/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c new file mode 100644 index 0000000000..dae2232403 --- /dev/null +++ b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BPP 12 +#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64 +#include "vp9dsp_init_16bpp_aarch64_template.c" diff --git a/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c new file mode 100644 index 0000000000..8dcfdeaaf7 --- /dev/null +++ b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavutil/aarch64/cpu.h" +#include "vp9dsp_init.h" + +#define declare_fpel(type, sz, suffix) \ +void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) + +#define decl_mc_func(op, filter, dir, sz, bpp) \ +void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) + +#define define_8tap_2d_fn(op, filter, sz, bpp) \ +static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, \ + ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \ + /* We only need h + 7 lines, but the horizontal filter assumes an \ + * even number of rows, so filter h + 8 lines here. */ \ + ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \ + src - 3 * src_stride, src_stride, \ + h + 8, mx, 0); \ + ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \ + temp + 3 * 2 * sz, 2 * sz, \ + h, 0, my); \ +} + +#define decl_filter_funcs(op, dir, sz, bpp) \ + decl_mc_func(op, regular, dir, sz, bpp); \ + decl_mc_func(op, sharp, dir, sz, bpp); \ + decl_mc_func(op, smooth, dir, sz, bpp) + +#define decl_mc_funcs(sz, bpp) \ + decl_filter_funcs(put, h, sz, bpp); \ + decl_filter_funcs(avg, h, sz, bpp); \ + decl_filter_funcs(put, v, sz, bpp); \ + decl_filter_funcs(avg, v, sz, bpp); \ + decl_filter_funcs(put, hv, sz, bpp); \ + decl_filter_funcs(avg, hv, sz, bpp) + +#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64 +#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64 +#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64 + +declare_fpel(copy, 128, ); +declare_fpel(copy, 64, ); +declare_fpel(copy, 32, ); +declare_fpel(copy, 16, ); +declare_fpel(copy, 8, ); +declare_fpel(avg, 64, _16); +declare_fpel(avg, 32, _16); +declare_fpel(avg, 16, _16); +declare_fpel(avg, 8, _16); +declare_fpel(avg, 4, _16); + +decl_mc_funcs(64, BPP); +decl_mc_funcs(32, BPP); +decl_mc_funcs(16, BPP); +decl_mc_funcs(8, BPP); +decl_mc_funcs(4, BPP); + +#define define_8tap_2d_funcs(sz, bpp) \ + define_8tap_2d_fn(put, regular, sz, bpp) \ + define_8tap_2d_fn(put, sharp, sz, bpp) \ + define_8tap_2d_fn(put, smooth, sz, bpp) \ + define_8tap_2d_fn(avg, regular, sz, bpp) \ + define_8tap_2d_fn(avg, sharp, sz, bpp) \ + define_8tap_2d_fn(avg, smooth, sz, bpp) + +define_8tap_2d_funcs(64, BPP) +define_8tap_2d_funcs(32, BPP) +define_8tap_2d_funcs(16, BPP) +define_8tap_2d_funcs(8, BPP) +define_8tap_2d_funcs(4, BPP) + +static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + +#define init_fpel(idx1, idx2, sz, type, suffix) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix + +#define init_copy(idx, sz, suffix) \ + init_fpel(idx, 0, sz, copy, suffix) + +#define init_avg(idx, sz, suffix) \ + init_fpel(idx, 1, sz, avg, suffix) + +#define init_copy_avg(idx, sz1, sz2) \ + init_copy(idx, sz2, _neon); \ + init_avg (idx, sz1, _16_neon) + + if (have_armv8(cpu_flags)) { + init_copy(0, 128, _aarch64); + init_copy(1, 64, _aarch64); + init_copy(2, 32, _aarch64); + } + + if (have_neon(cpu_flags)) { +#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \ + dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon + +#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \ + init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp) + +#define init_mc_funcs_dirs(idx, sz, bpp) \ + init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \ + init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \ + init_mc_funcs(idx, hv, 1, 1, sz, , bpp) + + + init_avg(0, 64, _16_neon); + init_avg(1, 32, _16_neon); + init_avg(2, 16, _16_neon); + init_copy_avg(3, 8, 16); + init_copy_avg(4, 4, 8); + + init_mc_funcs_dirs(0, 64, BPP); + init_mc_funcs_dirs(1, 32, BPP); + init_mc_funcs_dirs(2, 16, BPP); + init_mc_funcs_dirs(3, 8, BPP); + init_mc_funcs_dirs(4, 4, BPP); + } +} + +#define define_itxfm2(type_a, type_b, sz, bpp) \ +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \ + ptrdiff_t stride, \ + int16_t *_block, int eob) +#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp) + +#define define_itxfm_funcs(sz, bpp) \ + define_itxfm(idct, idct, sz, bpp); \ + define_itxfm(iadst, idct, sz, bpp); \ + define_itxfm(idct, iadst, sz, bpp); \ + define_itxfm(iadst, iadst, sz, bpp) + +define_itxfm_funcs(4, BPP); +define_itxfm_funcs(8, BPP); +define_itxfm_funcs(16, BPP); +define_itxfm(idct, idct, 32, BPP); +define_itxfm(iwht, iwht, 4, BPP); + + +static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_itxfm2(tx, sz, bpp) \ + dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon +#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp) + +#define init_idct2(tx, nm, bpp) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon +#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp) + + init_itxfm(TX_4X4, 4x4, BPP); + init_itxfm(TX_8X8, 8x8, BPP); + init_itxfm(TX_16X16, 16x16, BPP); + init_idct(TX_32X32, idct_idct_32x32, BPP); + init_idct(4, iwht_iwht_4x4, BPP); + } +} + +#define define_loop_filter(dir, wd, size, bpp) \ +void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H) + +#define define_loop_filters(wd, size, bpp) \ + define_loop_filter(h, wd, size, bpp); \ + define_loop_filter(v, wd, size, bpp) + +define_loop_filters(4, 8, BPP); +define_loop_filters(8, 8, BPP); +define_loop_filters(16, 8, BPP); + +define_loop_filters(16, 16, BPP); + +define_loop_filters(44, 16, BPP); +define_loop_filters(48, 16, BPP); +define_loop_filters(84, 16, BPP); +define_loop_filters(88, 16, BPP); + +static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \ + dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon + +#define init_lpf_func_16(idx, dir, bpp) \ + dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon + +#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \ + dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon + +#define init_lpf_funcs_8_wd(idx, wd, bpp) \ + init_lpf_func_8(idx, 0, h, wd, bpp); \ + init_lpf_func_8(idx, 1, v, wd, bpp) + +#define init_lpf_funcs_16(bpp) \ + init_lpf_func_16(0, h, bpp); \ + init_lpf_func_16(1, v, bpp) + +#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \ + init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \ + init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp) + +#define init_lpf_funcs_8(bpp) \ + init_lpf_funcs_8_wd(0, 4, bpp); \ + init_lpf_funcs_8_wd(1, 8, bpp); \ + init_lpf_funcs_8_wd(2, 16, bpp) + +#define init_lpf_funcs_mix2(bpp) \ + init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \ + init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \ + init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \ + init_lpf_funcs_mix2_wd(1, 1, 88, bpp) + + init_lpf_funcs_8(BPP); + init_lpf_funcs_16(BPP); + init_lpf_funcs_mix2(BPP); + } +} + +av_cold void INIT_FUNC(VP9DSPContext *dsp) +{ + vp9dsp_mc_init_aarch64(dsp); + vp9dsp_loopfilter_init_aarch64(dsp); + vp9dsp_itxfm_init_aarch64(dsp); +} diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c index 3ce2c1b2b9..4c699759fe 100644 --- a/libavcodec/aarch64/vp9dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c @@ -1,28 +1,30 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdint.h> #include "libavutil/attributes.h" +#include "libavutil/internal.h" #include "libavutil/aarch64/cpu.h" -#include "libavcodec/vp9.h" +#include "libavcodec/vp9dsp.h" +#include "vp9dsp_init.h" #define declare_fpel(type, sz) \ void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ @@ -239,8 +241,17 @@ static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp) } } -av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp) +av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp) { + if (bpp == 10) { + ff_vp9dsp_init_10bpp_aarch64(dsp); + return; + } else if (bpp == 12) { + ff_vp9dsp_init_12bpp_aarch64(dsp); + return; + } else if (bpp != 8) + return; + vp9dsp_mc_init_aarch64(dsp); vp9dsp_loopfilter_init_aarch64(dsp); vp9dsp_itxfm_init_aarch64(dsp); diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S new file mode 100644 index 0000000000..68296d9c40 --- /dev/null +++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S @@ -0,0 +1,2017 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +const itxfm4_coeffs, align=4 + .short 11585, 0, 6270, 15137 +iadst4_coeffs: + .short 5283, 15212, 9929, 13377 +endconst + +const iadst8_coeffs, align=4 + .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 +idct_coeffs: + .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102 + .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756 + .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 + .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 +endconst + +const iadst16_coeffs, align=4 + .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053 + .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207 +endconst + +.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7 + trn1 \r4\().4s, \r0\().4s, \r1\().4s + trn2 \r5\().4s, \r0\().4s, \r1\().4s + trn1 \r6\().4s, \r2\().4s, \r3\().4s + trn2 \r7\().4s, \r2\().4s, \r3\().4s + trn1 \r0\().2d, \r4\().2d, \r6\().2d + trn2 \r2\().2d, \r4\().2d, \r6\().2d + trn1 \r1\().2d, \r5\().2d, \r7\().2d + trn2 \r3\().2d, \r5\().2d, \r7\().2d +.endm + +// Transpose a 8x8 matrix of 32 bit elements, where each row is spread out +// over two registers. +.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3 + transpose_4x4s \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3 + transpose_4x4s \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3 + + // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14 + // while swapping the two 4x4 matrices between each other + + // First step of the 4x4 transpose of r1-r7, into t0-t3 + trn1 \t0\().4s, \r1\().4s, \r3\().4s + trn2 \t1\().4s, \r1\().4s, \r3\().4s + trn1 \t2\().4s, \r5\().4s, \r7\().4s + trn2 \t3\().4s, \r5\().4s, \r7\().4s + + // First step of the 4x4 transpose of r8-r12, into r1-r7 + trn1 \r1\().4s, \r8\().4s, \r10\().4s + trn2 \r3\().4s, \r8\().4s, \r10\().4s + trn1 \r5\().4s, \r12\().4s, \r14\().4s + trn2 \r7\().4s, \r12\().4s, \r14\().4s + + // Second step of the 4x4 transpose of r1-r7 (now in t0-r3), into r8-r12 + trn1 \r8\().2d, \t0\().2d, \t2\().2d + trn2 \r12\().2d, \t0\().2d, \t2\().2d + trn1 \r10\().2d, \t1\().2d, \t3\().2d + trn2 \r14\().2d, \t1\().2d, \t3\().2d + + // Second step of the 4x4 transpose of r8-r12 (now in r1-r7), in place as far as possible + trn1 \t0\().2d, \r1\().2d, \r5\().2d + trn2 \r5\().2d, \r1\().2d, \r5\().2d + trn1 \t1\().2d, \r3\().2d, \r7\().2d + trn2 \r7\().2d, \r3\().2d, \r7\().2d + + // Move the outputs of trn1 back in place + mov \r1\().16b, \t0\().16b + mov \r3\().16b, \t1\().16b +.endm + +// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 +// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 +// in/out are .4s registers; this can do with 4 temp registers, but is +// more efficient if 6 temp registers are available. +.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0 +.if \neg > 0 + neg \tmp4\().4s, v0.4s +.endif + add \tmp1\().4s, \in1\().4s, \in2\().4s + sub \tmp2\().4s, \in1\().4s, \in2\().4s +.if \neg > 0 + smull \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0] + smull2 \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0] +.else + smull \tmp3\().2d, \tmp1\().2s, v0.s[0] + smull2 \tmp4\().2d, \tmp1\().4s, v0.s[0] +.endif +.ifb \tmp5 + rshrn \out1\().2s, \tmp3\().2d, #14 + rshrn2 \out1\().4s, \tmp4\().2d, #14 + smull \tmp3\().2d, \tmp2\().2s, v0.s[0] + smull2 \tmp4\().2d, \tmp2\().4s, v0.s[0] + rshrn \out2\().2s, \tmp3\().2d, #14 + rshrn2 \out2\().4s, \tmp4\().2d, #14 +.else + smull \tmp5\().2d, \tmp2\().2s, v0.s[0] + smull2 \tmp6\().2d, \tmp2\().4s, v0.s[0] + rshrn \out1\().2s, \tmp3\().2d, #14 + rshrn2 \out1\().4s, \tmp4\().2d, #14 + rshrn \out2\().2s, \tmp5\().2d, #14 + rshrn2 \out2\().4s, \tmp6\().2d, #14 +.endif +.endm + +// Same as dmbutterfly0 above, but treating the input in in2 as zero, +// writing the same output into both out1 and out2. +.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 + smull \tmp1\().2d, \in1\().2s, v0.s[0] + smull2 \tmp2\().2d, \in1\().4s, v0.s[0] + rshrn \out1\().2s, \tmp1\().2d, #14 + rshrn2 \out1\().4s, \tmp2\().2d, #14 + rshrn \out2\().2s, \tmp1\().2d, #14 + rshrn2 \out2\().4s, \tmp2\().2d, #14 +.endm + +// out1,out2 = in1 * coef1 - in2 * coef2 +// out3,out4 = in1 * coef2 + in2 * coef1 +// out are 4 x .2d registers, in are 2 x .4s registers +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2 + smull \out1\().2d, \in1\().2s, \coef1 + smull2 \out2\().2d, \in1\().4s, \coef1 + smull \out3\().2d, \in1\().2s, \coef2 + smull2 \out4\().2d, \in1\().4s, \coef2 + smlsl \out1\().2d, \in2\().2s, \coef2 + smlsl2 \out2\().2d, \in2\().4s, \coef2 + smlal \out3\().2d, \in2\().2s, \coef1 + smlal2 \out4\().2d, \in2\().4s, \coef1 +.endm + +// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14 +// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 +// inout are 2 x .4s registers +.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0 + dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2 +.if \neg > 0 + neg \tmp3\().2d, \tmp3\().2d + neg \tmp4\().2d, \tmp4\().2d +.endif + rshrn \inout1\().2s, \tmp1\().2d, #14 + rshrn2 \inout1\().4s, \tmp2\().2d, #14 + rshrn \inout2\().2s, \tmp3\().2d, #14 + rshrn2 \inout2\().4s, \tmp4\().2d, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().2d, \inout1\().2s, \coef1 + smull2 \tmp2\().2d, \inout1\().4s, \coef1 + smull \tmp3\().2d, \inout1\().2s, \coef2 + smull2 \tmp4\().2d, \inout1\().4s, \coef2 + rshrn \inout1\().2s, \tmp1\().2d, #14 + rshrn2 \inout1\().4s, \tmp2\().2d, #14 + rshrn \inout2\().2s, \tmp3\().2d, #14 + rshrn2 \inout2\().4s, \tmp4\().2d, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1 as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().2d, \inout2\().2s, \coef2 + smull2 \tmp2\().2d, \inout2\().4s, \coef2 + smull \tmp3\().2d, \inout2\().2s, \coef1 + smull2 \tmp4\().2d, \inout2\().4s, \coef1 + neg \tmp1\().2d, \tmp1\().2d + neg \tmp2\().2d, \tmp2\().2d + rshrn \inout2\().2s, \tmp3\().2d, #14 + rshrn2 \inout2\().4s, \tmp4\().2d, #14 + rshrn \inout1\().2s, \tmp1\().2d, #14 + rshrn2 \inout1\().4s, \tmp2\().2d, #14 +.endm + +.macro dsmull_h out1, out2, in, coef + smull \out1\().2d, \in\().2s, \coef + smull2 \out2\().2d, \in\().4s, \coef +.endm + +.macro drshrn_h out, in1, in2, shift + rshrn \out\().2s, \in1\().2d, \shift + rshrn2 \out\().4s, \in2\().2d, \shift +.endm + + +// out1 = in1 + in2 +// out2 = in1 - in2 +.macro butterfly_4s out1, out2, in1, in2 + add \out1\().4s, \in1\().4s, \in2\().4s + sub \out2\().4s, \in1\().4s, \in2\().4s +.endm + +// out1 = in1 - in2 +// out2 = in1 + in2 +.macro butterfly_4s_r out1, out2, in1, in2 + sub \out1\().4s, \in1\().4s, \in2\().4s + add \out2\().4s, \in1\().4s, \in2\().4s +.endm + +// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 +// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 +// out are 2 x .4s registers, in are 4 x .2d registers +.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4 + add \tmp1\().2d, \in1\().2d, \in3\().2d + add \tmp2\().2d, \in2\().2d, \in4\().2d + sub \tmp3\().2d, \in1\().2d, \in3\().2d + sub \tmp4\().2d, \in2\().2d, \in4\().2d + rshrn \out1\().2s, \tmp1\().2d, #14 + rshrn2 \out1\().4s, \tmp2\().2d, #14 + rshrn \out2\().2s, \tmp3\().2d, #14 + rshrn2 \out2\().4s, \tmp4\().2d, #14 +.endm + +.macro iwht4_10 c0, c1, c2, c3 + add \c0\().4s, \c0\().4s, \c1\().4s + sub v17.4s, \c2\().4s, \c3\().4s + sub v16.4s, \c0\().4s, v17.4s + sshr v16.4s, v16.4s, #1 + sub \c2\().4s, v16.4s, \c1\().4s + sub \c1\().4s, v16.4s, \c3\().4s + add \c3\().4s, v17.4s, \c2\().4s + sub \c0\().4s, \c0\().4s, \c1\().4s +.endm + +.macro iwht4_12 c0, c1, c2, c3 + iwht4_10 \c0, \c1, \c2, \c3 +.endm + +.macro idct4_10 c0, c1, c2, c3 + mul v22.4s, \c1\().4s, v0.s[3] + mul v20.4s, \c1\().4s, v0.s[2] + add v16.4s, \c0\().4s, \c2\().4s + sub v17.4s, \c0\().4s, \c2\().4s + mla v22.4s, \c3\().4s, v0.s[2] + mul v18.4s, v16.4s, v0.s[0] + mul v24.4s, v17.4s, v0.s[0] + mls v20.4s, \c3\().4s, v0.s[3] + srshr v22.4s, v22.4s, #14 + srshr v18.4s, v18.4s, #14 + srshr v24.4s, v24.4s, #14 + srshr v20.4s, v20.4s, #14 + add \c0\().4s, v18.4s, v22.4s + sub \c3\().4s, v18.4s, v22.4s + add \c1\().4s, v24.4s, v20.4s + sub \c2\().4s, v24.4s, v20.4s +.endm + +.macro idct4_12 c0, c1, c2, c3 + smull v22.2d, \c1\().2s, v0.s[3] + smull2 v23.2d, \c1\().4s, v0.s[3] + smull v20.2d, \c1\().2s, v0.s[2] + smull2 v21.2d, \c1\().4s, v0.s[2] + add v16.4s, \c0\().4s, \c2\().4s + sub v17.4s, \c0\().4s, \c2\().4s + smlal v22.2d, \c3\().2s, v0.s[2] + smlal2 v23.2d, \c3\().4s, v0.s[2] + smull v18.2d, v16.2s, v0.s[0] + smull2 v19.2d, v16.4s, v0.s[0] + smull v24.2d, v17.2s, v0.s[0] + smull2 v25.2d, v17.4s, v0.s[0] + smlsl v20.2d, \c3\().2s, v0.s[3] + smlsl2 v21.2d, \c3\().4s, v0.s[3] + rshrn v22.2s, v22.2d, #14 + rshrn2 v22.4s, v23.2d, #14 + rshrn v18.2s, v18.2d, #14 + rshrn2 v18.4s, v19.2d, #14 + rshrn v24.2s, v24.2d, #14 + rshrn2 v24.4s, v25.2d, #14 + rshrn v20.2s, v20.2d, #14 + rshrn2 v20.4s, v21.2d, #14 + add \c0\().4s, v18.4s, v22.4s + sub \c3\().4s, v18.4s, v22.4s + add \c1\().4s, v24.4s, v20.4s + sub \c2\().4s, v24.4s, v20.4s +.endm + +.macro iadst4_10 c0, c1, c2, c3 + mul v16.4s, \c0\().4s, v1.s[0] + mla v16.4s, \c2\().4s, v1.s[1] + mla v16.4s, \c3\().4s, v1.s[2] + mul v18.4s, \c0\().4s, v1.s[2] + mls v18.4s, \c2\().4s, v1.s[0] + sub \c0\().4s, \c0\().4s, \c2\().4s + mls v18.4s, \c3\().4s, v1.s[1] + add \c0\().4s, \c0\().4s, \c3\().4s + mul v22.4s, \c1\().4s, v1.s[3] + mul v20.4s, \c0\().4s, v1.s[3] + add v24.4s, v16.4s, v22.4s + add v26.4s, v18.4s, v22.4s + srshr \c0\().4s, v24.4s, #14 + add v16.4s, v16.4s, v18.4s + srshr \c1\().4s, v26.4s, #14 + sub v16.4s, v16.4s, v22.4s + srshr \c2\().4s, v20.4s, #14 + srshr \c3\().4s, v16.4s, #14 +.endm + +.macro iadst4_12 c0, c1, c2, c3 + smull v16.2d, \c0\().2s, v1.s[0] + smull2 v17.2d, \c0\().4s, v1.s[0] + smlal v16.2d, \c2\().2s, v1.s[1] + smlal2 v17.2d, \c2\().4s, v1.s[1] + smlal v16.2d, \c3\().2s, v1.s[2] + smlal2 v17.2d, \c3\().4s, v1.s[2] + smull v18.2d, \c0\().2s, v1.s[2] + smull2 v19.2d, \c0\().4s, v1.s[2] + smlsl v18.2d, \c2\().2s, v1.s[0] + smlsl2 v19.2d, \c2\().4s, v1.s[0] + sub \c0\().4s, \c0\().4s, \c2\().4s + smlsl v18.2d, \c3\().2s, v1.s[1] + smlsl2 v19.2d, \c3\().4s, v1.s[1] + add \c0\().4s, \c0\().4s, \c3\().4s + smull v22.2d, \c1\().2s, v1.s[3] + smull2 v23.2d, \c1\().4s, v1.s[3] + smull v20.2d, \c0\().2s, v1.s[3] + smull2 v21.2d, \c0\().4s, v1.s[3] + add v24.2d, v16.2d, v22.2d + add v25.2d, v17.2d, v23.2d + add v26.2d, v18.2d, v22.2d + add v27.2d, v19.2d, v23.2d + rshrn \c0\().2s, v24.2d, #14 + rshrn2 \c0\().4s, v25.2d, #14 + add v16.2d, v16.2d, v18.2d + add v17.2d, v17.2d, v19.2d + rshrn \c1\().2s, v26.2d, #14 + rshrn2 \c1\().4s, v27.2d, #14 + sub v16.2d, v16.2d, v22.2d + sub v17.2d, v17.2d, v23.2d + rshrn \c2\().2s, v20.2d, #14 + rshrn2 \c2\().4s, v21.2d, #14 + rshrn \c3\().2s, v16.2d, #14 + rshrn2 \c3\().4s, v17.2d, #14 +.endm + +// The public functions in this file have got the following signature: +// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +.macro itxfm_func4x4 txfm1, txfm2, bpp +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1 +.ifc \txfm1,\txfm2 +.ifc \txfm1,idct + movrel x4, itxfm4_coeffs + ld1 {v0.4h}, [x4] + sxtl v0.4s, v0.4h +.endif +.ifc \txfm1,iadst + movrel x4, iadst4_coeffs + ld1 {v0.d}[1], [x4] + sxtl2 v1.4s, v0.8h +.endif +.else + movrel x4, itxfm4_coeffs + ld1 {v0.8h}, [x4] + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h +.endif + + movi v30.4s, #0 + movi v31.4s, #0 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.ne 1f + // DC-only for idct/idct + ld1 {v2.s}[0], [x2] + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v31.s}[0], [x2] + dup v4.4s, v2.s[0] + mov v5.16b, v4.16b + mov v6.16b, v4.16b + mov v7.16b, v4.16b + b 2f +.endif + +1: + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2] + st1 {v30.4s,v31.4s}, [x2], #32 + +.ifc \txfm1,iwht + sshr v4.4s, v4.4s, #2 + sshr v5.4s, v5.4s, #2 + sshr v6.4s, v6.4s, #2 + sshr v7.4s, v7.4s, #2 +.endif + + \txfm1\()4_\bpp v4, v5, v6, v7 + + st1 {v30.4s,v31.4s}, [x2], #32 + // Transpose 4x4 with 32 bit elements + transpose_4x4s v4, v5, v6, v7, v16, v17, v18, v19 + + \txfm2\()4_\bpp v4, v5, v6, v7 +2: + mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 + ld1 {v0.4h}, [x0], x1 + ld1 {v1.4h}, [x0], x1 +.ifnc \txfm1,iwht + srshr v4.4s, v4.4s, #4 + srshr v5.4s, v5.4s, #4 + srshr v6.4s, v6.4s, #4 + srshr v7.4s, v7.4s, #4 +.endif + uaddw v4.4s, v4.4s, v0.4h + uaddw v5.4s, v5.4s, v1.4h + ld1 {v2.4h}, [x0], x1 + ld1 {v3.4h}, [x0], x1 + sqxtun v0.4h, v4.4s + sqxtun2 v0.8h, v5.4s + sub x0, x0, x1, lsl #2 + + uaddw v6.4s, v6.4s, v2.4h + umin v0.8h, v0.8h, v31.8h + uaddw v7.4s, v7.4s, v3.4h + st1 {v0.4h}, [x0], x1 + sqxtun v2.4h, v6.4s + sqxtun2 v2.8h, v7.4s + umin v2.8h, v2.8h, v31.8h + + st1 {v0.d}[1], [x0], x1 + st1 {v2.4h}, [x0], x1 + st1 {v2.d}[1], [x0], x1 + + ret +endfunc +.endm + +.macro itxfm_funcs4x4 bpp +itxfm_func4x4 idct, idct, \bpp +itxfm_func4x4 iadst, idct, \bpp +itxfm_func4x4 idct, iadst, \bpp +itxfm_func4x4 iadst, iadst, \bpp +itxfm_func4x4 iwht, iwht, \bpp +.endm + +itxfm_funcs4x4 10 +itxfm_funcs4x4 12 + +function idct8x8_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + sxtl v0.4s, v0.4h + + ld1 {v2.s}[0], [x2] + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v1.s}[0], [x2] + dup v2.4s, v2.s[0] + + srshr v2.4s, v2.4s, #5 + + mov x4, #8 + mov x3, x0 + dup v31.8h, w5 +1: + // Loop to add the constant from v2 into all 8x8 outputs + subs x4, x4, #2 + ld1 {v3.8h}, [x0], x1 + ld1 {v4.8h}, [x0], x1 + uaddw v16.4s, v2.4s, v3.4h + uaddw2 v17.4s, v2.4s, v3.8h + uaddw v18.4s, v2.4s, v4.4h + uaddw2 v19.4s, v2.4s, v4.8h + sqxtun v3.4h, v16.4s + sqxtun2 v3.8h, v17.4s + sqxtun v4.4h, v18.4s + sqxtun2 v4.8h, v19.4s + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + st1 {v3.8h}, [x3], x1 + st1 {v4.8h}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 + dmbutterfly0 \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a + dmbutterfly \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3 // r2 = t2a, r6 = t3a + dmbutterfly \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3 // r1 = t4a, r7 = t7a + dmbutterfly \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3 // r5 = t5a, r3 = t6a + + butterfly_4s \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3 + butterfly_4s \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a + butterfly_4s \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a + butterfly_4s \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2 + + dmbutterfly0 \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5 + + butterfly_4s \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6] + butterfly_4s \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7] + butterfly_4s \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5] + butterfly_4s \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4] +.endm + +.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 + dmbutterfly_l \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0] // t2,t3 = t1a, t0,t1 = t0a + dmbutterfly_l \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0] // r0,r7 = t5a, t4,t5 = t4a + + dbutterfly_n \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4 + dbutterfly_n \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5 + + dmbutterfly_l \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2] // t4,t5 = t3a, t2,t3 = t2a + dmbutterfly_l \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2] // r2,r5 = t7a, r0,r7 = t6a + + dbutterfly_n \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6 + dbutterfly_n \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7 + + butterfly_4s \r7, \r4, \r4, \r0 // r7 = -out[7], r4 = t3 + neg \r7\().4s, \r7\().4s // r7 = out[7] + butterfly_4s \r0, \r1, \r3, \r1 // r0 = out[0], r1 = t2 + + dmbutterfly_l \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3] // r2,r3 = t5a, t3,t5 = t4a + dmbutterfly_l \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2] // t0,t1 = t6a, r5,r6 = t7a + + dbutterfly_n \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7 + + dmbutterfly0 \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2 // r3 = -out[3], r4 = out[4] + neg \r3\().4s, \r3\().4s // r3 = out[3] + + dbutterfly_n \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6 + neg \r1\().4s, \r1\().4s // r1 = out[1] + + dmbutterfly0 \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5 // r2 = out[2], r5 = -out[5] + neg \r5\().4s, \r5\().4s // r5 = out[5] +.endm + + +.macro itxfm_func8x8 txfm1, txfm2 +function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.eq idct8x8_dc_add_neon +.endif + // The iadst also uses a few coefficients from + // idct, so those always need to be loaded. +.ifc \txfm1\()_\txfm2,idct_idct + movrel x4, idct_coeffs +.else + movrel x4, iadst8_coeffs + ld1 {v1.8h}, [x4], #16 + stp d8, d9, [sp, #-0x10]! + sxtl2 v3.4s, v1.8h + sxtl v2.4s, v1.4h +.endif + ld1 {v0.8h}, [x4] + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + + movi v4.4s, #0 + movi v5.4s, #0 + movi v6.4s, #0 + movi v7.4s, #0 + +1: + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64 + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64 + ld1 {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64 + ld1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 + sub x2, x2, #256 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + +.ifc \txfm1\()_\txfm2,idct_idct + idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7 + idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7 +.else + \txfm1\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9 + \txfm1\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9 +.endif + + // Transpose 8x8 with 16 bit elements + transpose_8x8s v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7 + +.ifc \txfm1\()_\txfm2,idct_idct + idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7 + idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7 +.else + \txfm2\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9 + \txfm2\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9 +.endif +2: + mov x3, x0 + // Add into the destination + ld1 {v0.8h}, [x0], x1 + srshr v16.4s, v16.4s, #5 + srshr v17.4s, v17.4s, #5 + ld1 {v1.8h}, [x0], x1 + srshr v18.4s, v18.4s, #5 + srshr v19.4s, v19.4s, #5 + ld1 {v2.8h}, [x0], x1 + srshr v20.4s, v20.4s, #5 + srshr v21.4s, v21.4s, #5 + uaddw v16.4s, v16.4s, v0.4h + uaddw2 v17.4s, v17.4s, v0.8h + ld1 {v3.8h}, [x0], x1 + srshr v22.4s, v22.4s, #5 + srshr v23.4s, v23.4s, #5 + uaddw v18.4s, v18.4s, v1.4h + uaddw2 v19.4s, v19.4s, v1.8h + ld1 {v4.8h}, [x0], x1 + srshr v24.4s, v24.4s, #5 + srshr v25.4s, v25.4s, #5 + uaddw v20.4s, v20.4s, v2.4h + uaddw2 v21.4s, v21.4s, v2.8h + sqxtun v0.4h, v16.4s + sqxtun2 v0.8h, v17.4s + dup v16.8h, w5 + ld1 {v5.8h}, [x0], x1 + srshr v26.4s, v26.4s, #5 + srshr v27.4s, v27.4s, #5 + uaddw v22.4s, v22.4s, v3.4h + uaddw2 v23.4s, v23.4s, v3.8h + sqxtun v1.4h, v18.4s + sqxtun2 v1.8h, v19.4s + umin v0.8h, v0.8h, v16.8h + ld1 {v6.8h}, [x0], x1 + srshr v28.4s, v28.4s, #5 + srshr v29.4s, v29.4s, #5 + uaddw v24.4s, v24.4s, v4.4h + uaddw2 v25.4s, v25.4s, v4.8h + sqxtun v2.4h, v20.4s + sqxtun2 v2.8h, v21.4s + umin v1.8h, v1.8h, v16.8h + ld1 {v7.8h}, [x0], x1 + srshr v30.4s, v30.4s, #5 + srshr v31.4s, v31.4s, #5 + uaddw v26.4s, v26.4s, v5.4h + uaddw2 v27.4s, v27.4s, v5.8h + sqxtun v3.4h, v22.4s + sqxtun2 v3.8h, v23.4s + umin v2.8h, v2.8h, v16.8h + + st1 {v0.8h}, [x3], x1 + uaddw v28.4s, v28.4s, v6.4h + uaddw2 v29.4s, v29.4s, v6.8h + st1 {v1.8h}, [x3], x1 + sqxtun v4.4h, v24.4s + sqxtun2 v4.8h, v25.4s + umin v3.8h, v3.8h, v16.8h + st1 {v2.8h}, [x3], x1 + uaddw v30.4s, v30.4s, v7.4h + uaddw2 v31.4s, v31.4s, v7.8h + st1 {v3.8h}, [x3], x1 + sqxtun v5.4h, v26.4s + sqxtun2 v5.8h, v27.4s + umin v4.8h, v4.8h, v16.8h + st1 {v4.8h}, [x3], x1 + sqxtun v6.4h, v28.4s + sqxtun2 v6.8h, v29.4s + umin v5.8h, v5.8h, v16.8h + st1 {v5.8h}, [x3], x1 + sqxtun v7.4h, v30.4s + sqxtun2 v7.8h, v31.4s + umin v6.8h, v6.8h, v16.8h + + st1 {v6.8h}, [x3], x1 + umin v7.8h, v7.8h, v16.8h + st1 {v7.8h}, [x3], x1 + +.ifnc \txfm1\()_\txfm2,idct_idct + ldp d8, d9, [sp], 0x10 +.endif + ret +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1 + mov x5, #0x03ff + b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1 + mov x5, #0x0fff + b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +endfunc +.endm + +itxfm_func8x8 idct, idct +itxfm_func8x8 iadst, idct +itxfm_func8x8 idct, iadst +itxfm_func8x8 iadst, iadst + + +function idct16x16_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + sxtl v0.4s, v0.4h + + movi v1.4h, #0 + + ld1 {v2.s}[0], [x2] + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v1.s}[0], [x2] + dup v2.4s, v2.s[0] + + srshr v0.4s, v2.4s, #6 + + mov x3, x0 + mov x4, #16 + dup v31.8h, w13 +1: + // Loop to add the constant from v2 into all 16x16 outputs + subs x4, x4, #2 + ld1 {v1.8h,v2.8h}, [x0], x1 + uaddw v16.4s, v0.4s, v1.4h + uaddw2 v17.4s, v0.4s, v1.8h + ld1 {v3.8h,v4.8h}, [x0], x1 + uaddw v18.4s, v0.4s, v2.4h + uaddw2 v19.4s, v0.4s, v2.8h + uaddw v20.4s, v0.4s, v3.4h + uaddw2 v21.4s, v0.4s, v3.8h + uaddw v22.4s, v0.4s, v4.4h + uaddw2 v23.4s, v0.4s, v4.8h + sqxtun v1.4h, v16.4s + sqxtun2 v1.8h, v17.4s + sqxtun v2.4h, v18.4s + sqxtun2 v2.8h, v19.4s + sqxtun v3.4h, v20.4s + sqxtun2 v3.8h, v21.4s + sqxtun v4.4h, v22.4s + sqxtun2 v4.8h, v23.4s + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + st1 {v1.8h,v2.8h}, [x3], x1 + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + st1 {v3.8h,v4.8h}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct16_end + butterfly_4s v18, v7, v4, v7 // v18 = t0a, v7 = t7a + butterfly_4s v19, v22, v5, v22 // v19 = t1a, v22 = t6 + butterfly_4s v4, v26, v20, v26 // v4 = t2a, v26 = t5 + butterfly_4s v5, v6, v28, v6 // v5 = t3a, v6 = t4 + butterfly_4s v20, v28, v16, v24 // v20 = t8a, v28 = t11a + butterfly_4s v24, v21, v23, v21 // v24 = t9, v21 = t10 + butterfly_4s v23, v27, v25, v27 // v23 = t14, v27 = t13 + butterfly_4s v25, v29, v29, v17 // v25 = t15a, v29 = t12a + + dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31 // v8 = t13a, v9 = t10a + dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 + + butterfly_4s v16, v31, v18, v25 // v16 = out[0], v31 = out[15] + butterfly_4s v17, v30, v19, v23 // v17 = out[1], v30 = out[14] + butterfly_4s_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] + butterfly_4s v23, v24, v7, v20 // v23 = out[7], v24 = out[8] + butterfly_4s v18, v29, v4, v8 // v18 = out[2], v29 = out[13] + butterfly_4s v19, v28, v5, v28 // v19 = out[3], v28 = out[12] + butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11] + butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10] + ret +.endm + +function idct16 + dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a + dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a + dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a + dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a + dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a + dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a + dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a + dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a + + butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_half + dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a + dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a + dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a + dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a + dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a + dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a + dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a + dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a + + butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_quarter + dsmull_h v24, v25, v19, v3.s[3] + dsmull_h v4, v5, v17, v2.s[0] + dsmull_h v7, v6, v18, v1.s[1] + dsmull_h v30, v31, v18, v1.s[0] + neg v24.2d, v24.2d + neg v25.2d, v25.2d + dsmull_h v29, v28, v17, v2.s[1] + dsmull_h v26, v27, v19, v3.s[2] + dsmull_h v22, v23, v16, v0.s[0] + drshrn_h v24, v24, v25, #14 + drshrn_h v16, v4, v5, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v6, v30, v31, #14 + drshrn_h v29, v29, v28, #14 + drshrn_h v17, v26, v27, #14 + drshrn_h v28, v22, v23, #14 + + dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3] + dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3] + neg v22.2d, v22.2d + neg v23.2d, v23.2d + drshrn_h v27, v20, v21, #14 + drshrn_h v21, v22, v23, #14 + drshrn_h v23, v18, v19, #14 + drshrn_h v25, v30, v31, #14 + mov v4.16b, v28.16b + mov v5.16b, v28.16b + dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31 + mov v20.16b, v28.16b + idct16_end +endfunc + +function iadst16 + ld1 {v0.8h,v1.8h}, [x11] + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + + dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0] // v6,v7 = t1, v4,v5 = t0 + dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0] // v10,v11 = t9, v8,v9 = t8 + dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a + dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2] // v14,v15 = t3, v12,v13 = t2 + dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a + + dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2] // v6,v7 = t11, v4,v5 = t10 + dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a + dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0] // v10,v11 = t5, v8,v9 = t4 + dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a + + dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0] // v14,v15 = t13, v12,v13 = t12 + dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a + dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2] // v6,v7 = t7, v4,v5 = t6 + dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a + + dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2] // v10,v11 = t15, v8,v9 = t14 + ld1 {v0.8h}, [x10] + dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1] // v14,v15 = t9, v12,v13 = t8 + dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a + + dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0] // v4,v5 = t12, v6,v7 = t13 + dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a + dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3] // v10,v11 = t11, v8,v9 = t10 + butterfly_4s_r v4, v27, v16, v27 // v4 = t4, v27 = t0 + dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a + + dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2] // v12,v13 = t14, v14,v15 = t15 + butterfly_4s_r v5, v20, v31, v20 // v5 = t5, v20 = t1 + dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a + dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a + + butterfly_4s_r v6, v25, v18, v25 // v6 = t6, v25 = t2 + butterfly_4s_r v7, v22, v29, v22 // v7 = t7, v22 = t3 + + dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3] // v10,v11 = t13, v8,v9 = t12 + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2] // v12,v13 = t14, v14,v15 = t15 + + dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a + dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a + neg v29.4s, v29.4s // v29 = out[13] + + dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3] // v10,v11 = t5a, v8,v9 = t4a + dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2] // v12,v13 = t6a, v14,v15 = t7a + + butterfly_4s v2, v6, v27, v25 // v2 = out[0], v6 = t2a + butterfly_4s v3, v7, v23, v21 // v3 =-out[1], v7 = t10 + + dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 + neg v19.4s, v19.4s // v19 = out[3] + dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 + + butterfly_4s v5, v8, v20, v22 // v5 =-out[15],v8 = t3a + butterfly_4s v4, v9, v24, v26 // v4 = out[14],v9 = t11 + + dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] + dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] + dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] + dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] + + neg v31.4s, v5.4s // v31 = out[15] + neg v17.4s, v3.4s // v17 = out[1] + + mov v16.16b, v2.16b + mov v30.16b, v4.16b + ret +endfunc + +// Helper macros; we can't use these expressions directly within +// e.g. .irp due to the extra concatenation \(). Therefore wrap +// them in macros to allow using .irp below. +.macro load i, src, inc + ld1 {v\i\().4s}, [\src], \inc +.endm +.macro store i, dst, inc + st1 {v\i\().4s}, [\dst], \inc +.endm +.macro movi_v i, size, imm + movi v\i\()\size, \imm +.endm +.macro load_clear i, src, inc + ld1 {v\i\().4s}, [\src] + st1 {v4.4s}, [\src], \inc +.endm + +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7 + srshr \coef0, \coef0, #6 + ld1 {v4.4h}, [x0], x1 + srshr \coef1, \coef1, #6 + ld1 {v4.d}[1], [x3], x1 + srshr \coef2, \coef2, #6 + ld1 {v5.4h}, [x0], x1 + srshr \coef3, \coef3, #6 + uaddw \coef0, \coef0, v4.4h + ld1 {v5.d}[1], [x3], x1 + srshr \coef4, \coef4, #6 + uaddw2 \coef1, \coef1, v4.8h + ld1 {v6.4h}, [x0], x1 + srshr \coef5, \coef5, #6 + uaddw \coef2, \coef2, v5.4h + ld1 {v6.d}[1], [x3], x1 + sqxtun v4.4h, \coef0 + srshr \coef6, \coef6, #6 + uaddw2 \coef3, \coef3, v5.8h + ld1 {v7.4h}, [x0], x1 + sqxtun2 v4.8h, \coef1 + srshr \coef7, \coef7, #6 + uaddw \coef4, \coef4, v6.4h + ld1 {v7.d}[1], [x3], x1 + umin v4.8h, v4.8h, v8.8h + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + sqxtun v5.4h, \coef2 + uaddw2 \coef5, \coef5, v6.8h + st1 {v4.4h}, [x0], x1 + sqxtun2 v5.8h, \coef3 + uaddw \coef6, \coef6, v7.4h + st1 {v4.d}[1], [x3], x1 + umin v5.8h, v5.8h, v8.8h + sqxtun v6.4h, \coef4 + uaddw2 \coef7, \coef7, v7.8h + st1 {v5.4h}, [x0], x1 + sqxtun2 v6.8h, \coef5 + st1 {v5.d}[1], [x3], x1 + umin v6.8h, v6.8h, v8.8h + sqxtun v7.4h, \coef6 + st1 {v6.4h}, [x0], x1 + sqxtun2 v7.8h, \coef7 + st1 {v6.d}[1], [x3], x1 + umin v7.8h, v7.8h, v8.8h + st1 {v7.4h}, [x0], x1 + st1 {v7.d}[1], [x3], x1 +.endm + +// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, +// transpose into a horizontal 16x4 slice and store. +// x0 = dst (temp buffer) +// x1 = slice offset +// x2 = src +// x9 = input stride +.macro itxfm16_1d_funcs txfm +function \txfm\()16_1d_4x16_pass1_neon + mov x14, x30 + + movi v4.4s, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr + + bl \txfm\()16 + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. + transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the transposed 4x4 blocks horizontally. + cmp x1, #12 + b.eq 1f +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 + store \i, x0, #16 +.endr + br x14 +1: + // Special case: For the last input column (x1 == 12), + // which would be stored as the last row in the temp buffer, + // don't store the first 4x4 block, but keep it in registers + // for the first slice of the second pass (where it is the + // last 4x4 block). + add x0, x0, #16 + st1 {v20.4s}, [x0], #16 + st1 {v24.4s}, [x0], #16 + st1 {v28.4s}, [x0], #16 + add x0, x0, #16 + st1 {v21.4s}, [x0], #16 + st1 {v25.4s}, [x0], #16 + st1 {v29.4s}, [x0], #16 + add x0, x0, #16 + st1 {v22.4s}, [x0], #16 + st1 {v26.4s}, [x0], #16 + st1 {v30.4s}, [x0], #16 + add x0, x0, #16 + st1 {v23.4s}, [x0], #16 + st1 {v27.4s}, [x0], #16 + st1 {v31.4s}, [x0], #16 + + mov v28.16b, v16.16b + mov v29.16b, v17.16b + mov v30.16b, v18.16b + mov v31.16b, v19.16b + br x14 +endfunc + +// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, +// load the destination pixels (from a similar 4x16 slice), add and store back. +// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x3 = slice offset +// x9 = temp buffer stride +function \txfm\()16_1d_4x16_pass2_neon + mov x14, x30 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 + load \i, x2, x9 +.endr + cbz x3, 1f +.irp i, 28, 29, 30, 31 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl \txfm\()16 + + dup v8.8h, w13 + load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + + br x14 +endfunc +.endm + +itxfm16_1d_funcs idct +itxfm16_1d_funcs iadst + +// This is the minimum eob value for each subpartition, in increments of 4 +const min_eob_idct_idct_16, align=4 + .short 0, 10, 38, 89 +endconst + +.macro itxfm_func16x16 txfm1, txfm2 +function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.eq idct16x16_dc_add_neon +.endif + mov x15, x30 + // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9. +.ifnc \txfm1\()_\txfm2,idct_idct + stp d14, d15, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! +.endif + stp d8, d9, [sp, #-0x10]! + + sub sp, sp, #1024 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + movrel x10, idct_coeffs +.ifnc \txfm1\()_\txfm2,idct_idct + movrel x11, iadst16_coeffs +.endif +.ifc \txfm1,idct + ld1 {v0.8h,v1.8h}, [x10] + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h +.endif + mov x9, #64 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #10 + b.le idct16x16_quarter_add_16_neon + cmp w3, #38 + b.le idct16x16_half_add_16_neon + + movrel x12, min_eob_idct_idct_16, 2 +.endif + +.irp i, 0, 4, 8, 12 + add x0, sp, #(\i*64) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(16 - \i)/4 + b.le 1f +.endif +.endif + mov x1, #\i + add x2, x6, #(\i*4) + bl \txfm1\()16_1d_4x16_pass1_neon +.endr +.ifc \txfm1\()_\txfm2,iadst_idct + ld1 {v0.8h,v1.8h}, [x10] + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h +.endif + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + // Set v28-v31 to zero, for the in-register passthrough of + // coefficients to pass 2. + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9 +.endr + b.ne 2b +3: +.endif + +.irp i, 0, 4, 8, 12 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + mov x3, #\i + bl \txfm2\()16_1d_4x16_pass2_neon +.endr + + add sp, sp, #1024 + ldp d8, d9, [sp], 0x10 +.ifnc \txfm1\()_\txfm2,idct_idct + ldp d10, d11, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d14, d15, [sp], 0x10 +.endif + br x15 +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1 + mov x13, #0x03ff + b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1 + mov x13, #0x0fff + b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +endfunc +.endm + +itxfm_func16x16 idct, idct +itxfm_func16x16 iadst, idct +itxfm_func16x16 idct, iadst +itxfm_func16x16 iadst, iadst + +function idct16_1d_4x16_pass1_quarter_neon + mov x14, x30 + + movi v4.4s, #0 +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr + + bl idct16_quarter + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. + transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the transposed 4x4 blocks horizontally. + // The first 4x4 block is kept in registers for the second pass, + // store the rest in the temp buffer. + add x0, x0, #16 + st1 {v20.4s}, [x0], #16 + st1 {v24.4s}, [x0], #16 + st1 {v28.4s}, [x0], #16 + add x0, x0, #16 + st1 {v21.4s}, [x0], #16 + st1 {v25.4s}, [x0], #16 + st1 {v29.4s}, [x0], #16 + add x0, x0, #16 + st1 {v22.4s}, [x0], #16 + st1 {v26.4s}, [x0], #16 + st1 {v30.4s}, [x0], #16 + add x0, x0, #16 + st1 {v23.4s}, [x0], #16 + st1 {v27.4s}, [x0], #16 + st1 {v31.4s}, [x0], #16 + br x14 +endfunc + +function idct16_1d_4x16_pass2_quarter_neon + mov x14, x30 + + // Only load the top 4 lines, and only do it for the later slices. + // For the first slice, d16-d19 is kept in registers from the first pass. + cbz x3, 1f +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_quarter + + dup v8.8h, w13 + load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + + br x14 +endfunc + +function idct16_1d_4x16_pass1_half_neon + mov x14, x30 + + movi v4.4s, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr + + bl idct16_half + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. + transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the transposed 4x4 blocks horizontally. + cmp x1, #4 + b.eq 1f +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 + store \i, x0, #16 +.endr + br x14 +1: + // Special case: For the second input column (r1 == 4), + // which would be stored as the second row in the temp buffer, + // don't store the first 4x4 block, but keep it in registers + // for the first slice of the second pass (where it is the + // second 4x4 block). + add x0, x0, #16 + st1 {v20.4s}, [x0], #16 + st1 {v24.4s}, [x0], #16 + st1 {v28.4s}, [x0], #16 + add x0, x0, #16 + st1 {v21.4s}, [x0], #16 + st1 {v25.4s}, [x0], #16 + st1 {v29.4s}, [x0], #16 + add x0, x0, #16 + st1 {v22.4s}, [x0], #16 + st1 {v26.4s}, [x0], #16 + st1 {v30.4s}, [x0], #16 + add x0, x0, #16 + st1 {v23.4s}, [x0], #16 + st1 {v27.4s}, [x0], #16 + st1 {v31.4s}, [x0], #16 + + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v22.16b, v18.16b + mov v23.16b, v19.16b + br x14 +endfunc + +function idct16_1d_4x16_pass2_half_neon + mov x14, x30 + +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + cbz x3, 1f +.irp i, 20, 21, 22, 23 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_half + + dup v8.8h, w13 + load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + + br x14 +endfunc + +.macro idct16_partial size +function idct16x16_\size\()_add_16_neon + add x0, sp, #(0*64) + mov x1, #0 + add x2, x6, #(0*4) + bl idct16_1d_4x16_pass1_\size\()_neon +.ifc \size,half + add x0, sp, #(4*64) + mov x1, #4 + add x2, x6, #(4*4) + bl idct16_1d_4x16_pass1_\size\()_neon +.endif + +.irp i, 0, 4, 8, 12 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + mov x3, #\i + bl idct16_1d_4x16_pass2_\size\()_neon +.endr + + add sp, sp, #1024 + ldp d8, d9, [sp], 0x10 + br x15 +endfunc +.endm + +idct16_partial quarter +idct16_partial half + +function idct32x32_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + sxtl v0.4s, v0.4h + + movi v1.4h, #0 + + ld1 {v2.s}[0], [x2] + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v1.s}[0], [x2] + dup v2.4s, v2.s[0] + + srshr v0.4s, v2.4s, #6 + + mov x3, x0 + mov x4, #32 + sub x1, x1, #32 + dup v31.8h, w13 +1: + // Loop to add the constant v0 into all 32x32 outputs + subs x4, x4, #1 + ld1 {v1.8h,v2.8h}, [x0], #32 + uaddw v16.4s, v0.4s, v1.4h + uaddw2 v17.4s, v0.4s, v1.8h + ld1 {v3.8h,v4.8h}, [x0], x1 + uaddw v18.4s, v0.4s, v2.4h + uaddw2 v19.4s, v0.4s, v2.8h + uaddw v20.4s, v0.4s, v3.4h + uaddw2 v21.4s, v0.4s, v3.8h + uaddw v22.4s, v0.4s, v4.4h + uaddw2 v23.4s, v0.4s, v4.8h + sqxtun v1.4h, v16.4s + sqxtun2 v1.8h, v17.4s + sqxtun v2.4h, v18.4s + sqxtun2 v2.8h, v19.4s + sqxtun v3.4h, v20.4s + sqxtun2 v3.8h, v21.4s + sqxtun v4.4h, v22.4s + sqxtun2 v4.8h, v23.4s + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + st1 {v1.8h,v2.8h}, [x3], #32 + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + st1 {v3.8h,v4.8h}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct32_end + butterfly_4s v16, v5, v4, v5 // v16 = t16a, v5 = t19a + butterfly_4s v17, v20, v23, v20 // v17 = t17, v20 = t18 + butterfly_4s v18, v6, v7, v6 // v18 = t23a, v6 = t20a + butterfly_4s v19, v21, v22, v21 // v19 = t22, v21 = t21 + butterfly_4s v4, v28, v28, v30 // v4 = t24a, v28 = t27a + butterfly_4s v23, v26, v25, v26 // v23 = t25, v26 = t26 + butterfly_4s v7, v8, v29, v31 // v7 = t31a, v3 = t28a + butterfly_4s v22, v27, v24, v27 // v22 = t30, v27 = t29 + + dmbutterfly v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a + dmbutterfly v8, v5, v0.s[2], v0.s[3], v24, v25, v30, v31 // v3 = t19, v5 = t28 + dmbutterfly v28, v6, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20 + dmbutterfly v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a + + butterfly_4s v31, v24, v7, v4 // v31 = t31, v24 = t24 + butterfly_4s v30, v25, v22, v23 // v30 = t30a, v25 = t25a + butterfly_4s_r v23, v16, v16, v18 // v23 = t23, v16 = t16 + butterfly_4s_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a + butterfly_4s v18, v21, v27, v21 // v18 = t18, v21 = t21 + butterfly_4s_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a + butterfly_4s v29, v26, v20, v26 // v29 = t29, v26 = t26 + butterfly_4s v19, v20, v8, v6 // v19 = t19a, v20 = t20 + + dmbutterfly0 v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27, v20 = t20 + dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a + dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25, v22 = t22 + dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a + ret +.endm + +function idct32_odd + dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_half + dmbutterfly_h1 v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_quarter + dsmull_h v4, v5, v16, v10.s[0] + dsmull_h v28, v29, v19, v11.s[3] + dsmull_h v30, v31, v16, v10.s[1] + dsmull_h v22, v23, v17, v13.s[2] + dsmull_h v7, v6, v17, v13.s[3] + dsmull_h v26, v27, v19, v11.s[2] + dsmull_h v20, v21, v18, v12.s[0] + dsmull_h v24, v25, v18, v12.s[1] + + neg v28.2d, v28.2d + neg v29.2d, v29.2d + neg v7.2d, v7.2d + neg v6.2d, v6.2d + + drshrn_h v4, v4, v5, #14 + drshrn_h v5, v28, v29, #14 + drshrn_h v29, v30, v31, #14 + drshrn_h v28, v22, v23, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v31, v26, v27, #14 + drshrn_h v6, v20, v21, #14 + drshrn_h v30, v24, v25, #14 + + dmbutterfly_l v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1] + dmbutterfly_l v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1] + drshrn_h v23, v16, v17, #14 + drshrn_h v24, v18, v19, #14 + neg v20.2d, v20.2d + neg v21.2d, v21.2d + drshrn_h v27, v27, v26, #14 + drshrn_h v20, v20, v21, #14 + dmbutterfly_l v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3] + drshrn_h v21, v16, v17, #14 + drshrn_h v26, v18, v19, #14 + dmbutterfly_l v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3] + drshrn_h v25, v16, v17, #14 + neg v18.2d, v18.2d + neg v19.2d, v19.2d + drshrn_h v22, v18, v19, #14 + + idct32_end +endfunc + +.macro idct32_funcs suffix +// Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. +// The 32-point IDCT can be decomposed into two 16-point IDCTs; +// a normal IDCT16 with every other input component (the even ones, with +// each output written twice), followed by a separate 16-point IDCT +// of the odd inputs, added/subtracted onto the outputs of the first idct16. +// x0 = dst (temp buffer) +// x1 = unused +// x2 = src +// x9 = double input stride +function idct32_1d_4x32_pass1\suffix\()_neon + mov x14, x30 + + movi v4.4s, #0 + + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct16\suffix + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. + transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the registers a, b, c, d horizontally, followed by the + // same registers d, c, b, a mirrored. +.macro store_rev a, b, c, d + // There's no rev128 instruction, but we reverse each 64 bit + // half, and then flip them using an ext with 8 bytes offset. + rev64 v7.4s, \d + st1 {\a}, [x0], #16 + ext v7.16b, v7.16b, v7.16b, #8 + st1 {\b}, [x0], #16 + rev64 v6.4s, \c + st1 {\c}, [x0], #16 + ext v6.16b, v6.16b, v6.16b, #8 + st1 {\d}, [x0], #16 + rev64 v5.4s, \b + st1 {v7.4s}, [x0], #16 + ext v5.16b, v5.16b, v5.16b, #8 + st1 {v6.4s}, [x0], #16 + rev64 v4.4s, \a + st1 {v5.4s}, [x0], #16 + ext v4.16b, v4.16b, v4.16b, #8 + st1 {v4.4s}, [x0], #16 +.endm + store_rev v16.4s, v20.4s, v24.4s, v28.4s + store_rev v17.4s, v21.4s, v25.4s, v29.4s + store_rev v18.4s, v22.4s, v26.4s, v30.4s + store_rev v19.4s, v23.4s, v27.4s, v31.4s + sub x0, x0, #512 +.purgem store_rev + + // Move x2 back to the start of the input, and move + // to the first odd row +.ifb \suffix + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half + sub x2, x2, x9, lsl #3 +.endif + add x2, x2, #128 + + movi v4.4s, #0 + // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct32_odd\suffix + + transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7 + transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7 + transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7 + transpose_4x4s v19, v18, v17, v16, v4, v5, v6, v7 + + // Store the registers a, b, c, d horizontally, + // adding into the output first, and the mirrored, + // subtracted from the output. +.macro store_rev a, b, c, d, a16b, b16b + ld1 {v4.4s}, [x0] + rev64 v9.4s, \d + add v4.4s, v4.4s, \a + st1 {v4.4s}, [x0], #16 + rev64 v8.4s, \c + ld1 {v4.4s}, [x0] + ext v9.16b, v9.16b, v9.16b, #8 + add v4.4s, v4.4s, \b + st1 {v4.4s}, [x0], #16 + ext v8.16b, v8.16b, v8.16b, #8 + ld1 {v4.4s}, [x0] + rev64 \b, \b + add v4.4s, v4.4s, \c + st1 {v4.4s}, [x0], #16 + rev64 \a, \a + ld1 {v4.4s}, [x0] + ext \b16b, \b16b, \b16b, #8 + add v4.4s, v4.4s, \d + st1 {v4.4s}, [x0], #16 + ext \a16b, \a16b, \a16b, #8 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, v9.4s + st1 {v4.4s}, [x0], #16 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, v8.4s + st1 {v4.4s}, [x0], #16 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, \b + st1 {v4.4s}, [x0], #16 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, \a + st1 {v4.4s}, [x0], #16 +.endm + + store_rev v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b + store_rev v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b + store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b + store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b +.purgem store_rev + br x14 +endfunc + +// This is mostly the same as 4x32_pass1, but without the transpose, +// and use the source as temp buffer between the two idct passes, and +// add into the destination. +// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x7 = negative double temp buffer stride +// x9 = double temp buffer stride +function idct32_1d_4x32_pass2\suffix\()_neon + mov x14, x30 + + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + + bl idct16\suffix + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x2, x9 +.endr + + sub x2, x2, x9, lsl #4 + add x2, x2, #128 + + // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + sub x2, x2, #128 + + bl idct32_odd\suffix + +.macro load_acc_store a, b, c, d, neg=0 +.if \neg == 0 + ld1 {v4.4s}, [x2], x9 + ld1 {v5.4s}, [x2], x9 + add v4.4s, v4.4s, \a + ld1 {v6.4s}, [x2], x9 + add v5.4s, v5.4s, \b + ld1 {v7.4s}, [x2], x9 + add v6.4s, v6.4s, \c + add v7.4s, v7.4s, \d +.else + ld1 {v4.4s}, [x2], x7 + ld1 {v5.4s}, [x2], x7 + sub v4.4s, v4.4s, \a + ld1 {v6.4s}, [x2], x7 + sub v5.4s, v5.4s, \b + ld1 {v7.4s}, [x2], x7 + sub v6.4s, v6.4s, \c + sub v7.4s, v7.4s, \d +.endif + ld1 {v8.4h}, [x0], x1 + ld1 {v8.d}[1], [x0], x1 + srshr v4.4s, v4.4s, #6 + ld1 {v9.4h}, [x0], x1 + srshr v5.4s, v5.4s, #6 + uaddw v4.4s, v4.4s, v8.4h + ld1 {v9.d}[1], [x0], x1 + srshr v6.4s, v6.4s, #6 + uaddw2 v5.4s, v5.4s, v8.8h + srshr v7.4s, v7.4s, #6 + sub x0, x0, x1, lsl #2 + uaddw v6.4s, v6.4s, v9.4h + sqxtun v4.4h, v4.4s + uaddw2 v7.4s, v7.4s, v9.8h + sqxtun2 v4.8h, v5.4s + umin v4.8h, v4.8h, v15.8h + st1 {v4.4h}, [x0], x1 + sqxtun v5.4h, v6.4s + st1 {v4.d}[1], [x0], x1 + sqxtun2 v5.8h, v7.4s + umin v5.8h, v5.8h, v15.8h + st1 {v5.4h}, [x0], x1 + st1 {v5.d}[1], [x0], x1 +.endm + load_acc_store v31.4s, v30.4s, v29.4s, v28.4s + load_acc_store v27.4s, v26.4s, v25.4s, v24.4s + load_acc_store v23.4s, v22.4s, v21.4s, v20.4s + load_acc_store v19.4s, v18.4s, v17.4s, v16.4s + sub x2, x2, x9 + load_acc_store v16.4s, v17.4s, v18.4s, v19.4s, 1 + load_acc_store v20.4s, v21.4s, v22.4s, v23.4s, 1 + load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1 + load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1 +.purgem load_acc_store + br x14 +endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half + +const min_eob_idct_idct_32, align=4 + .short 0, 9, 34, 70, 135, 240, 336, 448 +endconst + +function vp9_idct_idct_32x32_add_16_neon + cmp w3, #1 + b.eq idct32x32_dc_add_neon + + movrel x10, idct_coeffs + + mov x15, x30 + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! + + sub sp, sp, #4096 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + // Double stride of the input, since we only read every other line + mov x9, #256 + neg x7, x9 + + ld1 {v0.8h,v1.8h}, [x10], #32 + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + ld1 {v10.8h,v11.8h}, [x10] + sxtl v12.4s, v11.4h + sxtl2 v13.4s, v11.8h + sxtl2 v11.4s, v10.8h + sxtl v10.4s, v10.4h + + dup v15.8h, w13 + + cmp w3, #34 + b.le idct32x32_quarter_add_16_neon + cmp w3, #135 + b.le idct32x32_half_add_16_neon + + movrel x12, min_eob_idct_idct_32, 2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x0, sp, #(\i*128) +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(32 - \i)/4 + b.le 1f +.endif + add x2, x6, #(\i*4) + bl idct32_1d_4x32_pass1_neon +.endr + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 +.endr + b.ne 2b +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + bl idct32_1d_4x32_pass2_neon +.endr + + add sp, sp, #4096 + ldp d14, d15, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d8, d9, [sp], 0x10 + + br x15 +endfunc + +function ff_vp9_idct_idct_32x32_add_10_neon, export=1 + mov x13, #0x03ff + b vp9_idct_idct_32x32_add_16_neon +endfunc + +function ff_vp9_idct_idct_32x32_add_12_neon, export=1 + mov x13, #0x0fff + b vp9_idct_idct_32x32_add_16_neon +endfunc + +.macro idct32_partial size +function idct32x32_\size\()_add_16_neon +.irp i, 0, 4 + add x0, sp, #(\i*128) +.ifc \size,quarter +.if \i == 4 + cmp w3, #9 + b.le 1f +.endif +.endif + add x2, x6, #(\i*4) + bl idct32_1d_4x32_pass1_\size\()_neon +.endr + +.ifc \size,half +.irp i, 8, 12 + add x0, sp, #(\i*128) +.if \i == 12 + cmp w3, #70 + b.le 1f +.endif + add x2, x6, #(\i*4) + bl idct32_1d_4x32_pass1_\size\()_neon +.endr +.endif + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + +.rept 4 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 +.endr + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + bl idct32_1d_4x32_pass2_\size\()_neon +.endr + + add sp, sp, #4096 + ldp d14, d15, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d8, d9, [sp], 0x10 + + br x15 +endfunc +.endm + +idct32_partial quarter +idct32_partial half diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 3ffb418963..99413b0f70 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S new file mode 100644 index 0000000000..9075f3d406 --- /dev/null +++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + + +.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8h, \r0\().8h, \r1\().8h + trn2 \t5\().8h, \r0\().8h, \r1\().8h + trn1 \t6\().8h, \r2\().8h, \r3\().8h + trn2 \t7\().8h, \r2\().8h, \r3\().8h + + trn1 \r0\().4s, \t4\().4s, \t6\().4s + trn2 \r2\().4s, \t4\().4s, \t6\().4s + trn1 \r1\().4s, \t5\().4s, \t7\().4s + trn2 \r3\().4s, \t5\().4s, \t7\().4s +.endm + +// The input to and output from this macro is in the registers v16-v31, +// and v0-v7 are used as scratch registers. +// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31 +// Depending on the width of the loop filter, we either use v16-v19 +// and v28-v31 as temp registers, or v8-v15. +.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 + dup v0.8h, w2 // E + dup v2.8h, w3 // I + dup v3.8h, w4 // H + + uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2) + uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1) + uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1) + uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2) + uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3) + umax v4.8h, v4.8h, v5.8h + umax v5.8h, v6.8h, v7.8h + umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h + uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0) + umax v4.8h, v4.8h, v5.8h + add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2 + uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1) + umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3)) + ushr v5.8h, v5.8h, #1 + cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I + add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + cmhs v6.8h, v0.8h, v6.8h + and v4.16b, v4.16b, v6.16b // fm + + // If no pixels need filtering, just exit as soon as possible + mov x11, v4.d[0] + mov x12, v4.d[1] + adds x11, x11, x12 + b.ne 1f + br x10 +1: + +.if \wd >= 8 + dup v0.8h, w5 + + uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0) + uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) + uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0) + uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0) + uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0) + umax v6.8h, v6.8h, v2.8h + umax v1.8h, v1.8h, \tmp1\().8h + umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h +.if \wd == 16 + uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0) + umax v6.8h, v6.8h, v1.8h + uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0) + umax v6.8h, v6.8h, \tmp2\().8h + uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0) + cmhs v6.8h, v0.8h, v6.8h // flat8in + uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0) + and v6.16b, v6.16b, v4.16b // flat8in && fm + uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0) + bic v4.16b, v4.16b, v6.16b // fm && !flat8in + uabd v10.8h, v29.8h, v24.8h // abs(q5 - q0) + uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0) + uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0) + + umax v7.8h, v7.8h, v2.8h + umax v1.8h, v1.8h, v8.8h + umax v9.8h, v9.8h, v10.8h + umax v11.8h, v11.8h, v12.8h + // The rest of the calculation of flat8out is interleaved below +.else + // The rest of the calculation of flat8in is interleaved below +.endif +.endif + + // Calculate the normal inner loop filter for 2 or 4 pixels + uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0) +.if \wd == 16 + umax v7.8h, v7.8h, v1.8h + umax v9.8h, v9.8h, v11.8h +.elseif \wd == 8 + umax v6.8h, v6.8h, v1.8h +.endif + uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0) +.if \wd == 16 + umax v7.8h, v7.8h, v9.8h +.elseif \wd == 8 + umax v6.8h, v6.8h, \tmp2\().8h +.endif + dup \tmp2\().8h, w6 // left shift for saturation + sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1 + neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation + umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0)) + sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0 + movi \tmp5\().8h, #3 +.if \wd == 8 + cmhs v6.8h, v0.8h, v6.8h // flat8in +.endif + cmhs v5.8h, v3.8h, v5.8h // !hev +.if \wd == 8 + and v6.16b, v6.16b, v4.16b // flat8in && fm +.endif + sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h +.if \wd == 16 + cmhs v7.8h, v0.8h, v7.8h // flat8out +.elseif \wd == 8 + bic v4.16b, v4.16b, v6.16b // fm && !flat8in +.endif + and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in +.if \wd == 16 + and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm +.endif + sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1) + + mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0) + bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0 + movi v2.8h, #4 + add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)] + movi v3.8h, #3 + sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h + movi \tmp5\().8h, #0 + sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f + dup \tmp6\().8h, w7 // max pixel value +.if \wd == 16 + bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out +.endif + + ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1 + + add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4 + add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3 + smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) + smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) + sshr \tmp3\().8h, \tmp3\().8h, #3 // f1 + sshr \tmp4\().8h, \tmp4\().8h, #3 // f2 + + add v0.8h, v23.8h, \tmp4\().8h // p0 + f2 + sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1 + smin v0.8h, v0.8h, \tmp6\().8h + smin v2.8h, v2.8h, \tmp6\().8h + srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1 + smax v0.8h, v0.8h, \tmp5\().8h // out p0 + smax v2.8h, v2.8h, \tmp5\().8h // out q0 + bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in) + bit v24.16b, v2.16b, v4.16b + + add v0.8h, v22.8h, \tmp3\().8h // p1 + f + sub v2.8h, v25.8h, \tmp3\().8h // q1 - f +.if \wd >= 8 + mov x11, v6.d[0] +.endif + smin v0.8h, v0.8h, \tmp6\().8h + smin v2.8h, v2.8h, \tmp6\().8h +.if \wd >= 8 + mov x12, v6.d[1] +.endif + smax v0.8h, v0.8h, \tmp5\().8h // out p1 + smax v2.8h, v2.8h, \tmp5\().8h // out q1 +.if \wd >= 8 + adds x11, x11, x12 +.endif + bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in) + bit v25.16b, v2.16b, v5.16b + + // If no pixels need flat8in, jump to flat8out + // (or to a writeout of the inner 4 pixels, for wd=8) +.if \wd >= 8 +.if \wd == 16 + b.eq 6f +.else + b.ne 1f + br x13 +1: +.endif + + // flat8in + add \tmp1\().8h, v20.8h, v21.8h + add \tmp3\().8h, v22.8h, v25.8h + add \tmp5\().8h, v20.8h, v22.8h + add \tmp7\().8h, v23.8h, v26.8h + add v0.8h, \tmp1\().8h, \tmp1\().8h + add v0.8h, v0.8h, v23.8h + add v0.8h, v0.8h, v24.8h + add v0.8h, v0.8h, \tmp5\().8h + sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h + sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h + urshr v2.8h, v0.8h, #3 // out p2 + + add v0.8h, v0.8h, \tmp3\().8h + add \tmp1\().8h, v20.8h, v23.8h + add \tmp3\().8h, v24.8h, v27.8h + urshr v3.8h, v0.8h, #3 // out p1 + + add v0.8h, v0.8h, \tmp7\().8h + sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h + add \tmp5\().8h, v21.8h, v24.8h + add \tmp7\().8h, v25.8h, v27.8h + urshr v4.8h, v0.8h, #3 // out p0 + + add v0.8h, v0.8h, \tmp3\().8h + sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h + add \tmp1\().8h, v22.8h, v25.8h + add \tmp3\().8h, v26.8h, v27.8h + urshr v5.8h, v0.8h, #3 // out q0 + + add v0.8h, v0.8h, \tmp7\().8h + sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h + urshr \tmp5\().8h, v0.8h, #3 // out q1 + + add v0.8h, v0.8h, \tmp3\().8h + // The output here is written back into the input registers. This doesn't + // matter for the flat8part below, since we only update those pixels + // which won't be touched below. + bit v21.16b, v2.16b, v6.16b + bit v22.16b, v3.16b, v6.16b + bit v23.16b, v4.16b, v6.16b + urshr \tmp6\().8h, v0.8h, #3 // out q2 + bit v24.16b, v5.16b, v6.16b + bit v25.16b, \tmp5\().16b, v6.16b + bit v26.16b, \tmp6\().16b, v6.16b +.endif +.if \wd == 16 +6: + orr v2.16b, v6.16b, v7.16b + mov x11, v2.d[0] + mov x12, v2.d[1] + adds x11, x11, x12 + b.ne 1f + // If no pixels needed flat8in nor flat8out, jump to a + // writeout of the inner 4 pixels + br x14 +1: + + mov x11, v7.d[0] + mov x12, v7.d[1] + adds x11, x11, x12 + b.ne 1f + // If no pixels need flat8out, jump to a writeout of the inner 6 pixels + br x15 + +1: + // flat8out + // This writes all outputs into v2-v17 (skipping v6 and v16). + // If this part is skipped, the output is read from v21-v26 (which is the input + // to this section). + shl v0.8h, v16.8h, #3 // 8 * v16 + sub v0.8h, v0.8h, v16.8h // 7 * v16 + add v0.8h, v0.8h, v17.8h + add v8.8h, v17.8h, v18.8h + add v10.8h, v19.8h, v20.8h + add v0.8h, v0.8h, v8.8h + add v8.8h, v16.8h, v17.8h + add v12.8h, v21.8h, v22.8h + add v0.8h, v0.8h, v10.8h + add v10.8h, v18.8h, v25.8h + add v14.8h, v23.8h, v24.8h + sub v10.8h, v10.8h, v8.8h + add v0.8h, v0.8h, v12.8h + add v0.8h, v0.8h, v14.8h + add v12.8h, v16.8h, v18.8h + add v14.8h, v19.8h, v26.8h + urshr v2.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + add v8.8h, v16.8h, v19.8h + add v10.8h, v20.8h, v27.8h + sub v14.8h, v14.8h, v12.8h + bif v2.16b, v17.16b, v7.16b + urshr v3.8h , v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v12.8h, v16.8h, v20.8h + add v14.8h, v21.8h, v28.8h + sub v10.8h, v10.8h, v8.8h + bif v3.16b, v18.16b, v7.16b + urshr v4.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + add v8.8h, v16.8h, v21.8h + add v10.8h, v22.8h, v29.8h + sub v14.8h, v14.8h, v12.8h + bif v4.16b, v19.16b, v7.16b + urshr v5.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v12.8h, v16.8h, v22.8h + add v14.8h, v23.8h, v30.8h + sub v10.8h, v10.8h, v8.8h + bif v5.16b, v20.16b, v7.16b + urshr v6.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + add v10.8h, v16.8h, v23.8h + sub v14.8h, v14.8h, v12.8h + add v12.8h, v24.8h, v31.8h + bif v6.16b, v21.16b, v7.16b + urshr v8.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + sub v10.8h, v12.8h, v10.8h + add v12.8h, v17.8h, v24.8h + add v14.8h, v25.8h, v31.8h + bif v8.16b, v22.16b, v7.16b + urshr v9.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + sub v14.8h, v14.8h, v12.8h + add v12.8h, v26.8h, v31.8h + bif v9.16b, v23.16b, v7.16b + urshr v10.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v14.8h, v18.8h, v25.8h + add v18.8h, v19.8h, v26.8h + sub v12.8h, v12.8h, v14.8h + add v14.8h, v27.8h, v31.8h + bif v10.16b, v24.16b, v7.16b + urshr v11.8h, v0.8h, #4 + + add v0.8h, v0.8h, v12.8h + add v12.8h, v20.8h, v27.8h + sub v14.8h, v14.8h, v18.8h + add v18.8h, v28.8h, v31.8h + bif v11.16b, v25.16b, v7.16b + sub v18.8h, v18.8h, v12.8h + urshr v12.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v14.8h, v21.8h, v28.8h + add v20.8h, v29.8h, v31.8h + bif v12.16b, v26.16b, v7.16b + urshr v13.8h, v0.8h, #4 + + add v0.8h, v0.8h, v18.8h + sub v20.8h, v20.8h, v14.8h + add v18.8h, v22.8h, v29.8h + add v22.8h, v30.8h, v31.8h + bif v13.16b, v27.16b, v7.16b + urshr v14.8h, v0.8h, #4 + + add v0.8h, v0.8h, v20.8h + sub v22.8h, v22.8h, v18.8h + bif v14.16b, v28.16b, v7.16b + urshr v15.8h, v0.8h, #4 + + add v0.8h, v0.8h, v22.8h + bif v15.16b, v29.16b, v7.16b + urshr v17.8h, v0.8h, #4 + bif v17.16b, v30.16b, v7.16b +.endif +.endm + +// For wd <= 8, we use v16-v19 and v28-v31 for temp registers, +// while we need those for inputs/outputs in wd=16 and use v8-v15 +// for temp registers there instead. +function vp9_loop_filter_4 + loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31 + ret +endfunc + +function vp9_loop_filter_8 + loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31 + ret +endfunc + +function vp9_loop_filter_16 + loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15 + ret +endfunc + +.macro loop_filter_4 + bl vp9_loop_filter_4 +.endm + +.macro loop_filter_8 + // calculate alternative 'return' targets + adr x13, 6f + bl vp9_loop_filter_8 +.endm + +.macro loop_filter_16 + // calculate alternative 'return' targets + adr x14, 7f + adr x15, 8f + bl vp9_loop_filter_16 +.endm + + +// The public functions in this file have got the following signature: +// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr); + +.macro bpp_frontend func, bpp, push +function ff_\func\()_\bpp\()_neon, export=1 +.if \push + mov x16, x30 + stp d14, d15, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! +.endif + lsl w2, w2, #\bpp - 8 + lsl w3, w3, #\bpp - 8 + lsl w4, w4, #\bpp - 8 + mov x5, #1 << (\bpp - 8) + mov x6, #16 - \bpp + mov x7, #((1 << \bpp) - 1) +.if \push + bl \func\()_16_neon + ldp d8, d9, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d14, d15, [sp], 0x10 + br x16 +.else + b \func\()_16_neon +.endif +endfunc +.endm + +.macro bpp_frontends func, push=0 + bpp_frontend \func, 10, \push + bpp_frontend \func, 12, \push +.endm + +.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push +function ff_\func\()_\suffix\()_\bpp\()_neon, export=1 + mov x16, x30 +.if \push + stp d14, d15, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! +.endif + lsl w2, w2, #\bpp - 8 + lsl w3, w3, #\bpp - 8 + lsl w4, w4, #\bpp - 8 + mov x5, #1 << (\bpp - 8) + mov x6, #16 - \bpp + mov x7, #((1 << \bpp) - 1) + bl \func\()_\int_suffix\()_16_neon +.ifc \dir,h + add x0, x0, x1, lsl #3 +.else + add x0, x0, #16 +.endif + bl \func\()_\int_suffix\()_16_neon +.if \push + ldp d8, d9, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d14, d15, [sp], 0x10 +.endif + br x16 +endfunc +.endm + +.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0 + bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push + bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push +.endm + +.macro bpp_frontend_mix2 wd1, wd2, dir, bpp +function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1 + mov x16, x30 + lsr w8, w2, #8 + lsr w14, w3, #8 + lsr w15, w4, #8 + and w2, w2, #0xff + and w3, w3, #0xff + and w4, w4, #0xff + lsl w2, w2, #\bpp - 8 + lsl w3, w3, #\bpp - 8 + lsl w4, w4, #\bpp - 8 + mov x5, #1 << (\bpp - 8) + mov x6, #16 - \bpp + mov x7, #((1 << \bpp) - 1) + bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon +.ifc \dir,h + add x0, x0, x1, lsl #3 +.else + add x0, x0, #16 +.endif + lsl w2, w8, #\bpp - 8 + lsl w3, w14, #\bpp - 8 + lsl w4, w15, #\bpp - 8 + bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon + br x16 +endfunc +.endm + +.macro bpp_frontends_mix2 wd1, wd2 + bpp_frontend_mix2 \wd1, \wd2, v, 10 + bpp_frontend_mix2 \wd1, \wd2, v, 12 + bpp_frontend_mix2 \wd1, \wd2, h, 10 + bpp_frontend_mix2 \wd1, \wd2, h, 12 +.endm + +function vp9_loop_filter_v_4_8_16_neon + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 // p3 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v21.8h}, [x9], x1 // p2 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v22.8h}, [x9], x1 // p1 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v23.8h}, [x9], x1 // p0 + ld1 {v27.8h}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + sub x9, x9, x1, lsl #1 + + loop_filter_4 + + st1 {v22.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + + br x10 +endfunc + +bpp_frontends vp9_loop_filter_v_4_8 + +function vp9_loop_filter_h_4_8_16_neon + mov x10, x30 + sub x9, x0, #8 + add x0, x9, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x9], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x9], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x9], x1 + ld1 {v27.8h}, [x0], x1 + + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + + transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_4 + + // Move x9 forward by 2 pixels; we don't need to rewrite the + // outermost 2 pixels since they aren't changed. + add x9, x9, #4 + add x0, x9, x1, lsl #2 + + // We only will write the mid 4 pixels back; after the loop filter, + // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels). + // We need to transpose them to columns, done with a 4x8 transpose + // (which in practice is two 4x4 transposes of the two 4x4 halves + // of the 8x4 pixels; into 4x8 pixels). + transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.d}[0], [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x9], x1 + st1 {v25.d}[1], [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #4 + + br x10 +endfunc + +bpp_frontends vp9_loop_filter_h_4_8 + +function vp9_loop_filter_v_8_8_16_neon + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 // p3 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v21.8h}, [x9], x1 // p2 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v22.8h}, [x9], x1 // p1 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v23.8h}, [x9], x1 // p0 + ld1 {v27.8h}, [x0], x1 // q3 + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #2 + add x9, x9, x1 + + loop_filter_8 + + st1 {v21.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + + br x10 +6: + sub x9, x0, x1, lsl #1 + st1 {v22.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + br x10 +endfunc + +bpp_frontends vp9_loop_filter_v_8_8 + +function vp9_loop_filter_h_8_8_16_neon + mov x10, x30 + sub x9, x0, #8 + add x0, x9, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x9], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x9], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x9], x1 + ld1 {v27.8h}, [x0], x1 + + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + + transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_8 + + add x0, x9, x1, lsl #2 + + // Even though only 6 pixels per row have been changed, we write the + // full 8 pixel registers. + transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v27.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + + br x10 +6: + // If we didn't need to do the flat8in part, we use the same writeback + // as in loop_filter_h_4_8. + add x9, x9, #4 + add x0, x9, x1, lsl #2 + transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.d}[0], [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x9], x1 + st1 {v25.d}[1], [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #4 + br x10 +endfunc + +bpp_frontends vp9_loop_filter_h_8_8 + +bpp_frontends_mix2 4, 4 +bpp_frontends_mix2 4, 8 +bpp_frontends_mix2 8, 4 +bpp_frontends_mix2 8, 8 + +function vp9_loop_filter_v_16_8_16_neon + mov x10, x30 + sub x9, x0, x1, lsl #3 + ld1 {v16.8h}, [x9], x1 // p7 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v17.8h}, [x9], x1 // p6 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v18.8h}, [x9], x1 // p5 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v19.8h}, [x9], x1 // p4 + ld1 {v27.8h}, [x0], x1 // q3 + ld1 {v20.8h}, [x9], x1 // p3 + ld1 {v28.8h}, [x0], x1 // q4 + ld1 {v21.8h}, [x9], x1 // p2 + ld1 {v29.8h}, [x0], x1 // q5 + ld1 {v22.8h}, [x9], x1 // p1 + ld1 {v30.8h}, [x0], x1 // q6 + ld1 {v23.8h}, [x9], x1 // p0 + ld1 {v31.8h}, [x0], x1 // q7 + sub x9, x9, x1, lsl #3 + sub x0, x0, x1, lsl #3 + add x9, x9, x1 + + loop_filter_16 + + // If we did the flat8out part, we get the output in + // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride, + // store v2-v9 there, and v10-v17 into x0. + st1 {v2.8h}, [x9], x1 + st1 {v10.8h}, [x0], x1 + st1 {v3.8h}, [x9], x1 + st1 {v11.8h}, [x0], x1 + st1 {v4.8h}, [x9], x1 + st1 {v12.8h}, [x0], x1 + st1 {v5.8h}, [x9], x1 + st1 {v13.8h}, [x0], x1 + st1 {v6.8h}, [x9], x1 + st1 {v14.8h}, [x0], x1 + st1 {v8.8h}, [x9], x1 + st1 {v15.8h}, [x0], x1 + st1 {v9.8h}, [x9], x1 + st1 {v17.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, x1 + + br x10 +8: + add x9, x9, x1, lsl #2 + // If we didn't do the flat8out part, the output is left in the + // input registers. + st1 {v21.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + br x10 +7: + sub x9, x0, x1, lsl #1 + st1 {v22.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + br x10 +endfunc + +bpp_frontends vp9_loop_filter_v_16_8, push=1 +bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1 + +function vp9_loop_filter_h_16_8_16_neon + mov x10, x30 + sub x9, x0, #16 + ld1 {v16.8h}, [x9], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v17.8h}, [x9], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v18.8h}, [x9], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v19.8h}, [x9], x1 + ld1 {v27.8h}, [x0], x1 + ld1 {v20.8h}, [x9], x1 + ld1 {v28.8h}, [x0], x1 + ld1 {v21.8h}, [x9], x1 + ld1 {v29.8h}, [x0], x1 + ld1 {v22.8h}, [x9], x1 + ld1 {v30.8h}, [x0], x1 + ld1 {v23.8h}, [x9], x1 + ld1 {v31.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + sub x9, x9, x1, lsl #3 + + // The 16x8 pixels read above is in two 8x8 blocks; the left + // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes + // of this, to get one column per register. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + loop_filter_16 + + transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1 + transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1 + + st1 {v16.8h}, [x9], x1 + st1 {v10.8h}, [x0], x1 + st1 {v2.8h}, [x9], x1 + st1 {v11.8h}, [x0], x1 + st1 {v3.8h}, [x9], x1 + st1 {v12.8h}, [x0], x1 + st1 {v4.8h}, [x9], x1 + st1 {v13.8h}, [x0], x1 + st1 {v5.8h}, [x9], x1 + st1 {v14.8h}, [x0], x1 + st1 {v6.8h}, [x9], x1 + st1 {v15.8h}, [x0], x1 + st1 {v8.8h}, [x9], x1 + st1 {v17.8h}, [x0], x1 + st1 {v9.8h}, [x9], x1 + st1 {v31.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + + br x10 +8: + // The same writeback as in loop_filter_h_8_8 + sub x9, x0, #8 + add x0, x9, x1, lsl #2 + transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v27.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + br x10 +7: + // The same writeback as in loop_filter_h_4_8 + sub x9, x0, #4 + add x0, x9, x1, lsl #2 + transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.d}[0], [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x9], x1 + st1 {v25.d}[1], [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #4 + br x10 +endfunc + +bpp_frontends vp9_loop_filter_h_16_8, push=1 +bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1 diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index e9c497096b..0878763020 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavcodec/aarch64/vp9mc_16bpp_neon.S b/libavcodec/aarch64/vp9mc_16bpp_neon.S new file mode 100644 index 0000000000..cac6428709 --- /dev/null +++ b/libavcodec/aarch64/vp9mc_16bpp_neon.S @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// All public functions in this file have the following signature: +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, +// const uint8_t *ref, ptrdiff_t ref_stride, +// int h, int mx, int my); + +function ff_vp9_copy128_aarch64, export=1 +1: + ldp x5, x6, [x2] + ldp x7, x8, [x2, #16] + stp x5, x6, [x0] + ldp x9, x10, [x2, #32] + stp x7, x8, [x0, #16] + subs w4, w4, #1 + ldp x11, x12, [x2, #48] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + ldp x5, x6, [x2, #64] + ldp x7, x8, [x2, #80] + stp x5, x6, [x0, #64] + ldp x9, x10, [x2, #96] + stp x7, x8, [x0, #80] + ldp x11, x12, [x2, #112] + stp x9, x10, [x0, #96] + stp x11, x12, [x0, #112] + add x2, x2, x3 + add x0, x0, x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg64_16_neon, export=1 + mov x5, x0 + sub x1, x1, #64 + sub x3, x3, #64 +1: + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v4.8h + urhadd v1.8h, v1.8h, v5.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1 + urhadd v2.8h, v2.8h, v6.8h + urhadd v3.8h, v3.8h, v7.8h + subs w4, w4, #1 + urhadd v16.8h, v16.8h, v20.8h + urhadd v17.8h, v17.8h, v21.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64 + urhadd v18.8h, v18.8h, v22.8h + urhadd v19.8h, v19.8h, v23.8h + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg32_16_neon, export=1 + mov x5, x0 +1: + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v4.8h + urhadd v1.8h, v1.8h, v5.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1 + urhadd v2.8h, v2.8h, v6.8h + urhadd v3.8h, v3.8h, v7.8h + subs w4, w4, #2 + urhadd v16.8h, v16.8h, v20.8h + urhadd v17.8h, v17.8h, v21.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1 + urhadd v18.8h, v18.8h, v22.8h + urhadd v19.8h, v19.8h, v23.8h + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg16_16_neon, export=1 +1: + ld1 {v2.8h, v3.8h}, [x2], x3 + ld1 {v0.8h, v1.8h}, [x0] + urhadd v0.8h, v0.8h, v2.8h + urhadd v1.8h, v1.8h, v3.8h + subs w4, w4, #1 + st1 {v0.8h, v1.8h}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg8_16_neon, export=1 + mov x5, x0 +1: + ld1 {v2.8h}, [x2], x3 + ld1 {v0.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v2.8h + ld1 {v1.8h}, [x0], x1 + urhadd v1.8h, v1.8h, v3.8h + subs w4, w4, #2 + st1 {v0.8h}, [x5], x1 + st1 {v1.8h}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg4_16_neon, export=1 + mov x5, x0 +1: + ld1 {v2.4h}, [x2], x3 + ld1 {v0.4h}, [x0], x1 + ld1 {v3.4h}, [x2], x3 + urhadd v0.4h, v0.4h, v2.4h + ld1 {v1.4h}, [x0], x1 + urhadd v1.4h, v1.4h, v3.4h + subs w4, w4, #2 + st1 {v0.4h}, [x5], x1 + st1 {v1.8b}, [x5], x1 + b.ne 1b + ret +endfunc + + +// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 +// for size >= 16), and multiply-accumulate into dst1 and dst5 (or +// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8 +// for size >= 16) +.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size + ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) + ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) + smlal \dst1\().4s, v20.4h, v0.h[\offset] + smlal \dst5\().4s, v22.4h, v0.h[\offset] +.if \size >= 16 + ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) + ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) +.endif +.if \size >= 8 + smlal2 \dst2\().4s, v20.8h, v0.h[\offset] + smlal2 \dst6\().4s, v22.8h, v0.h[\offset] +.endif +.if \size >= 16 + smlal \dst3\().4s, v21.4h, v0.h[\offset] + smlal \dst7\().4s, v23.4h, v0.h[\offset] + smlal2 \dst4\().4s, v21.8h, v0.h[\offset] + smlal2 \dst8\().4s, v23.8h, v0.h[\offset] +.endif +.endm + + +// Instantiate a horizontal filter function for the given size. +// This can work on 4, 8 or 16 pixels in parallel; for larger +// widths it will do 16 pixels at a time and loop horizontally. +// The actual width (in bytes) is passed in x5, the height in w4 and +// the filter coefficients in x9. +.macro do_8tap_h type, size +function \type\()_8tap_\size\()h + sub x2, x2, #6 + add x6, x0, x1 + add x7, x2, x3 + add x1, x1, x1 + add x3, x3, x3 + // Only size >= 16 loops horizontally and needs + // reduced dst stride +.if \size >= 16 + sub x1, x1, x5 +.endif + // size >= 16 loads two qwords and increments r2, + // for size 4/8 it's enough with one qword and no + // postincrement +.if \size >= 16 + sub x3, x3, x5 + sub x3, x3, #16 +.endif + // Load the filter vector + ld1 {v0.8h}, [x9] +1: +.if \size >= 16 + mov x9, x5 +.endif + // Load src +.if \size >= 16 + ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48 + ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48 +.else + ld1 {v5.8h, v6.8h}, [x2] + ld1 {v16.8h, v17.8h}, [x7] +.endif +2: + + smull v1.4s, v5.4h, v0.h[0] + smull v24.4s, v16.4h, v0.h[0] +.if \size >= 8 + smull2 v2.4s, v5.8h, v0.h[0] + smull2 v25.4s, v16.8h, v0.h[0] +.endif +.if \size >= 16 + smull v3.4s, v6.4h, v0.h[0] + smull v26.4s, v17.4h, v0.h[0] + smull2 v4.4s, v6.8h, v0.h[0] + smull2 v27.4s, v17.8h, v0.h[0] +.endif + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size + + // Round, shift and saturate + // The sqrshrun takes care of clamping negative values to zero, but + // we manually need to do umin with the max pixel value. + sqrshrun v1.4h, v1.4s, #7 + sqrshrun v24.4h, v24.4s, #7 +.if \size >= 8 + sqrshrun2 v1.8h, v2.4s, #7 + sqrshrun2 v24.8h, v25.4s, #7 + umin v1.8h, v1.8h, v31.8h + umin v24.8h, v24.8h, v31.8h +.if \size >= 16 + sqrshrun v2.4h, v3.4s, #7 + sqrshrun v25.4h, v26.4s, #7 + sqrshrun2 v2.8h, v4.4s, #7 + sqrshrun2 v25.8h, v27.4s, #7 + umin v2.8h, v2.8h, v31.8h + umin v25.8h, v25.8h, v31.8h +.endif +.else + umin v1.4h, v1.4h, v31.4h + umin v24.4h, v24.4h, v31.4h +.endif + // Average +.ifc \type,avg +.if \size >= 16 + ld1 {v3.8h, v4.8h}, [x0] + ld1 {v29.8h, v30.8h}, [x6] + urhadd v1.8h, v1.8h, v3.8h + urhadd v2.8h, v2.8h, v4.8h + urhadd v24.8h, v24.8h, v29.8h + urhadd v25.8h, v25.8h, v30.8h +.elseif \size >= 8 + ld1 {v3.8h}, [x0] + ld1 {v4.8h}, [x6] + urhadd v1.8h, v1.8h, v3.8h + urhadd v24.8h, v24.8h, v4.8h +.else + ld1 {v3.4h}, [x0] + ld1 {v4.4h}, [x6] + urhadd v1.4h, v1.4h, v3.4h + urhadd v24.4h, v24.4h, v4.4h +.endif +.endif + // Store and loop horizontally (for size >= 16) +.if \size >= 16 + subs x9, x9, #32 + st1 {v1.8h, v2.8h}, [x0], #32 + st1 {v24.8h, v25.8h}, [x6], #32 + b.eq 3f + mov v5.16b, v7.16b + mov v16.16b, v18.16b + ld1 {v6.8h, v7.8h}, [x2], #32 + ld1 {v17.8h, v18.8h}, [x7], #32 + b 2b +.elseif \size == 8 + st1 {v1.8h}, [x0] + st1 {v24.8h}, [x6] +.else // \size == 4 + st1 {v1.4h}, [x0] + st1 {v24.4h}, [x6] +.endif +3: + // Loop vertically + add x0, x0, x1 + add x6, x6, x1 + add x2, x2, x3 + add x7, x7, x3 + subs w4, w4, #2 + b.ne 1b + ret +endfunc +.endm + +.macro do_8tap_h_size size +do_8tap_h put, \size +do_8tap_h avg, \size +.endm + +do_8tap_h_size 4 +do_8tap_h_size 8 +do_8tap_h_size 16 + +.macro do_8tap_h_func type, filter, offset, size, bpp +function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1 + mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 + movrel x6, X(ff_vp9_subpel_filters), 256*\offset + cmp w5, #8 + add x9, x6, w5, uxtw #4 + mov x5, #2*\size +.if \size >= 16 + b \type\()_8tap_16h +.else + b \type\()_8tap_\size\()h +.endif +endfunc +.endm + +.macro do_8tap_h_filters size, bpp +do_8tap_h_func put, regular, 1, \size, \bpp +do_8tap_h_func avg, regular, 1, \size, \bpp +do_8tap_h_func put, sharp, 2, \size, \bpp +do_8tap_h_func avg, sharp, 2, \size, \bpp +do_8tap_h_func put, smooth, 0, \size, \bpp +do_8tap_h_func avg, smooth, 0, \size, \bpp +.endm + +.macro do_8tap_h_filters_bpp bpp +do_8tap_h_filters 64, \bpp +do_8tap_h_filters 32, \bpp +do_8tap_h_filters 16, \bpp +do_8tap_h_filters 8, \bpp +do_8tap_h_filters 4, \bpp +.endm + +do_8tap_h_filters_bpp 10 +do_8tap_h_filters_bpp 12 + + +// Vertical filters + +// Round, shift and saturate and store reg1-reg4 +.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type + sqrshrun \reg1\().4h, \reg1\().4s, #7 + sqrshrun \reg2\().4h, \reg2\().4s, #7 + sqrshrun \reg3\().4h, \reg3\().4s, #7 + sqrshrun \reg4\().4h, \reg4\().4s, #7 +.ifc \type,avg + ld1 {\tmp1\().4h}, [x7], x1 + ld1 {\tmp2\().4h}, [x7], x1 + ld1 {\tmp3\().4h}, [x7], x1 + ld1 {\tmp4\().4h}, [x7], x1 +.endif + umin \reg1\().4h, \reg1\().4h, \minreg\().4h + umin \reg2\().4h, \reg2\().4h, \minreg\().4h + umin \reg3\().4h, \reg3\().4h, \minreg\().4h + umin \reg4\().4h, \reg4\().4h, \minreg\().4h +.ifc \type,avg + urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h + urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h + urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h + urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h +.endif + st1 {\reg1\().4h}, [x0], x1 + st1 {\reg2\().4h}, [x0], x1 + st1 {\reg3\().4h}, [x0], x1 + st1 {\reg4\().4h}, [x0], x1 +.endm + +// Round, shift and saturate and store reg1-8, where +// reg1-2, reg3-4 etc pairwise correspond to 4 rows. +.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type + sqrshrun \reg1\().4h, \reg1\().4s, #7 + sqrshrun2 \reg1\().8h, \reg2\().4s, #7 + sqrshrun \reg2\().4h, \reg3\().4s, #7 + sqrshrun2 \reg2\().8h, \reg4\().4s, #7 + sqrshrun \reg3\().4h, \reg5\().4s, #7 + sqrshrun2 \reg3\().8h, \reg6\().4s, #7 + sqrshrun \reg4\().4h, \reg7\().4s, #7 + sqrshrun2 \reg4\().8h, \reg8\().4s, #7 +.ifc \type,avg + ld1 {\reg5\().8h}, [x7], x1 + ld1 {\reg6\().8h}, [x7], x1 + ld1 {\reg7\().8h}, [x7], x1 + ld1 {\reg8\().8h}, [x7], x1 +.endif + umin \reg1\().8h, \reg1\().8h, \minreg\().8h + umin \reg2\().8h, \reg2\().8h, \minreg\().8h + umin \reg3\().8h, \reg3\().8h, \minreg\().8h + umin \reg4\().8h, \reg4\().8h, \minreg\().8h +.ifc \type,avg + urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h + urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h + urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h + urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h +.endif + st1 {\reg1\().8h}, [x0], x1 + st1 {\reg2\().8h}, [x0], x1 + st1 {\reg3\().8h}, [x0], x1 + st1 {\reg4\().8h}, [x0], x1 +.endm + +// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2 +// (src1-src8 into dst1, src2-src9 into dst2). +.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2 + smull \dst1\().4s, \src1\().4h, v0.h[0] + smull \dst2\().4s, \src2\().4h, v0.h[0] + smull \tmp1\().4s, \src2\().4h, v0.h[1] + smull \tmp2\().4s, \src3\().4h, v0.h[1] + smlal \dst1\().4s, \src3\().4h, v0.h[2] + smlal \dst2\().4s, \src4\().4h, v0.h[2] + smlal \tmp1\().4s, \src4\().4h, v0.h[3] + smlal \tmp2\().4s, \src5\().4h, v0.h[3] + smlal \dst1\().4s, \src5\().4h, v0.h[4] + smlal \dst2\().4s, \src6\().4h, v0.h[4] + smlal \tmp1\().4s, \src6\().4h, v0.h[5] + smlal \tmp2\().4s, \src7\().4h, v0.h[5] + smlal \dst1\().4s, \src7\().4h, v0.h[6] + smlal \dst2\().4s, \src8\().4h, v0.h[6] + smlal \tmp1\().4s, \src8\().4h, v0.h[7] + smlal \tmp2\().4s, \src9\().4h, v0.h[7] + add \dst1\().4s, \dst1\().4s, \tmp1\().4s + add \dst2\().4s, \dst2\().4s, \tmp2\().4s +.endm + +// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4 +// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4). +.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9 + smull \dst1\().4s, \src1\().4h, v0.h[0] + smull2 \dst2\().4s, \src1\().8h, v0.h[0] + smull \dst3\().4s, \src2\().4h, v0.h[0] + smull2 \dst4\().4s, \src2\().8h, v0.h[0] + smlal \dst1\().4s, \src2\().4h, v0.h[1] + smlal2 \dst2\().4s, \src2\().8h, v0.h[1] + smlal \dst3\().4s, \src3\().4h, v0.h[1] + smlal2 \dst4\().4s, \src3\().8h, v0.h[1] + smlal \dst1\().4s, \src3\().4h, v0.h[2] + smlal2 \dst2\().4s, \src3\().8h, v0.h[2] + smlal \dst3\().4s, \src4\().4h, v0.h[2] + smlal2 \dst4\().4s, \src4\().8h, v0.h[2] + smlal \dst1\().4s, \src4\().4h, v0.h[3] + smlal2 \dst2\().4s, \src4\().8h, v0.h[3] + smlal \dst3\().4s, \src5\().4h, v0.h[3] + smlal2 \dst4\().4s, \src5\().8h, v0.h[3] + smlal \dst1\().4s, \src5\().4h, v0.h[4] + smlal2 \dst2\().4s, \src5\().8h, v0.h[4] + smlal \dst3\().4s, \src6\().4h, v0.h[4] + smlal2 \dst4\().4s, \src6\().8h, v0.h[4] + smlal \dst1\().4s, \src6\().4h, v0.h[5] + smlal2 \dst2\().4s, \src6\().8h, v0.h[5] + smlal \dst3\().4s, \src7\().4h, v0.h[5] + smlal2 \dst4\().4s, \src7\().8h, v0.h[5] + smlal \dst1\().4s, \src7\().4h, v0.h[6] + smlal2 \dst2\().4s, \src7\().8h, v0.h[6] + smlal \dst3\().4s, \src8\().4h, v0.h[6] + smlal2 \dst4\().4s, \src8\().8h, v0.h[6] + smlal \dst1\().4s, \src8\().4h, v0.h[7] + smlal2 \dst2\().4s, \src8\().8h, v0.h[7] + smlal \dst3\().4s, \src9\().4h, v0.h[7] + smlal2 \dst4\().4s, \src9\().8h, v0.h[7] +.endm + +// Instantiate a vertical filter function for filtering 8 pixels at a time. +// The height is passed in x4, the width in x5 and the filter coefficients +// in x6. +.macro do_8tap_8v type +function \type\()_8tap_8v + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +1: +.ifc \type,avg + mov x7, x0 +.endif + mov x6, x4 + + ld1 {v17.8h}, [x2], x3 + ld1 {v18.8h}, [x2], x3 + ld1 {v19.8h}, [x2], x3 + ld1 {v20.8h}, [x2], x3 + ld1 {v21.8h}, [x2], x3 + ld1 {v22.8h}, [x2], x3 + ld1 {v23.8h}, [x2], x3 +2: + ld1 {v24.8h}, [x2], x3 + ld1 {v25.8h}, [x2], x3 + ld1 {v26.8h}, [x2], x3 + ld1 {v27.8h}, [x2], x3 + + convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25 + convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27 + do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type + + subs x6, x6, #4 + b.eq 8f + + ld1 {v16.8h}, [x2], x3 + ld1 {v17.8h}, [x2], x3 + ld1 {v18.8h}, [x2], x3 + ld1 {v19.8h}, [x2], x3 + convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17 + convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19 + do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type + + subs x6, x6, #4 + b.eq 8f + + ld1 {v20.8h}, [x2], x3 + ld1 {v21.8h}, [x2], x3 + ld1 {v22.8h}, [x2], x3 + ld1 {v23.8h}, [x2], x3 + convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21 + convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23 + do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type + + subs x6, x6, #4 + b.ne 2b + +8: + subs x5, x5, #8 + b.eq 9f + // x0 -= h * dst_stride + msub x0, x1, x4, x0 + // x2 -= h * src_stride + msub x2, x3, x4, x2 + // x2 -= 8 * src_stride + sub x2, x2, x3, lsl #3 + // x2 += 1 * src_stride + add x2, x2, x3 + add x2, x2, #16 + add x0, x0, #16 + b 1b +9: + ret +endfunc +.endm + +do_8tap_8v put +do_8tap_8v avg + + +// Instantiate a vertical filter function for filtering a 4 pixels wide +// slice. This only is designed to work for 4 or 8 output lines. +.macro do_8tap_4v type +function \type\()_8tap_4v + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +.ifc \type,avg + mov x7, x0 +.endif + + ld1 {v16.4h}, [x2], x3 + ld1 {v17.4h}, [x2], x3 + ld1 {v18.4h}, [x2], x3 + ld1 {v19.4h}, [x2], x3 + ld1 {v20.4h}, [x2], x3 + ld1 {v21.4h}, [x2], x3 + ld1 {v22.4h}, [x2], x3 + ld1 {v23.4h}, [x2], x3 + ld1 {v24.4h}, [x2], x3 + ld1 {v25.4h}, [x2], x3 + ld1 {v26.4h}, [x2], x3 + + convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31 + convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31 + do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type + + subs x4, x4, #4 + b.eq 9f + + ld1 {v27.4h}, [x2], x3 + ld1 {v28.4h}, [x2], x3 + ld1 {v29.4h}, [x2], x3 + ld1 {v30.4h}, [x2], x3 + + convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17 + convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17 + do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type + +9: + ret +endfunc +.endm + +do_8tap_4v put +do_8tap_4v avg + + +.macro do_8tap_v_func type, filter, offset, size, bpp +function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1 + uxtw x4, w4 + mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 + movrel x5, X(ff_vp9_subpel_filters), 256*\offset + add x6, x5, w6, uxtw #4 + mov x5, #\size +.if \size >= 8 + b \type\()_8tap_8v +.else + b \type\()_8tap_4v +.endif +endfunc +.endm + +.macro do_8tap_v_filters size, bpp +do_8tap_v_func put, regular, 1, \size, \bpp +do_8tap_v_func avg, regular, 1, \size, \bpp +do_8tap_v_func put, sharp, 2, \size, \bpp +do_8tap_v_func avg, sharp, 2, \size, \bpp +do_8tap_v_func put, smooth, 0, \size, \bpp +do_8tap_v_func avg, smooth, 0, \size, \bpp +.endm + +.macro do_8tap_v_filters_bpp bpp +do_8tap_v_filters 64, \bpp +do_8tap_v_filters 32, \bpp +do_8tap_v_filters 16, \bpp +do_8tap_v_filters 8, \bpp +do_8tap_v_filters 4, \bpp +.endm + +do_8tap_v_filters_bpp 10 +do_8tap_v_filters_bpp 12 diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S index 584c114269..f67624ca04 100644 --- a/libavcodec/aarch64/vp9mc_neon.S +++ b/libavcodec/aarch64/vp9mc_neon.S @@ -1,20 +1,20 @@ /* * Copyright (c) 2016 Google Inc. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -269,8 +269,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2 sub x3, x3, #8 .endif // Load the filter vector - ld1 {v0.8b}, [x9] - sxtl v0.8h, v0.8b + ld1 {v0.8h}, [x9] 1: .if \size >= 16 mov x9, x5 @@ -384,9 +383,9 @@ do_8tap_h_size 16 .macro do_8tap_h_func type, filter, offset, size function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1 - movrel x6, X(ff_vp9_subpel_filters), 120*\offset - 8 + movrel x6, X(ff_vp9_subpel_filters), 256*\offset cmp w5, #8 - add x9, x6, w5, uxtw #3 + add x9, x6, w5, uxtw #4 mov x5, #\size .if \size >= 16 b.ge \type\()_8tap_16h_34 @@ -516,8 +515,7 @@ do_8tap_h_filters 4 function \type\()_8tap_8v_\idx1\idx2 sub x2, x2, x3, lsl #1 sub x2, x2, x3 - ld1 {v0.8b}, [x6] - sxtl v0.8h, v0.8b + ld1 {v0.8h}, [x6] 1: .ifc \type,avg mov x7, x0 @@ -590,8 +588,7 @@ do_8tap_8v avg, 4, 3 function \type\()_8tap_4v_\idx1\idx2 sub x2, x2, x3, lsl #1 sub x2, x2, x3 - ld1 {v0.8b}, [x6] - sxtl v0.8h, v0.8b + ld1 {v0.8h}, [x6] .ifc \type,avg mov x7, x0 .endif @@ -660,9 +657,9 @@ do_8tap_4v avg, 4, 3 .macro do_8tap_v_func type, filter, offset, size function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1 uxtw x4, w4 - movrel x5, X(ff_vp9_subpel_filters), 120*\offset - 8 + movrel x5, X(ff_vp9_subpel_filters), 256*\offset cmp w6, #8 - add x6, x5, w6, uxtw #3 + add x6, x5, w6, uxtw #4 mov x5, #\size .if \size >= 8 b.ge \type\()_8tap_8v_34 |