author     Swinney, Jonathan <jswinney@amazon.com>    2022-06-26 20:58:09 +0000
committer  Martin Storsjö <martin@martin.st>          2022-06-28 00:51:39 +0300
commit     c471cc74747461ca166559c7b7fdfe030c3e3712 (patch)
tree       488413070f1b5f7010aa751e64747f1ac416c721 /libavcodec/aarch64
parent     20e2aa940cd521bb3b1395e7c7a28cc34059abee (diff)
lavc/aarch64: motion estimation functions in neon

- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon

In direct microbenchmarks of these functions versus their C implementations, they performed as follows on AWS Graviton 3:

ff_pix_abs16_neon:
  pix_abs_0_0_c:    141.1
  pix_abs_0_0_neon:  19.6

ff_pix_abs16_xy2_neon:
  pix_abs_0_3_c:    269.1
  pix_abs_0_3_neon:  39.3

Tested with:
./tests/checkasm/checkasm --test=motion --bench --disable-linux-perf

Signed-off-by: Jonathan Swinney <jswinney@amazon.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
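
For reference, ff_pix_abs16 is a plain 16-pixel-wide sum of absolute differences (SAD) over h rows, while the _xy2 variant compares pix1 against a half-pel (x+1/2, y+1/2) interpolated reference in which each sample is the rounded average (a + b + c + d + 2) >> 2 of four neighbouring reference pixels. A scalar sketch of the plain SAD, illustrative only and mirroring the C reference in libavcodec/me_cmp.c in spirit rather than verbatim:

    #include <stddef.h>  /* ptrdiff_t */
    #include <stdint.h>  /* uint8_t */
    #include <stdlib.h>  /* abs() */

    /* Illustrative scalar equivalent of ff_pix_abs16_neon (not part of this patch). */
    static int pix_abs16_sketch(const uint8_t *pix1, const uint8_t *pix2,
                                ptrdiff_t stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++)
                sum += abs(pix1[x] - pix2[x]);
            pix1 += stride;
            pix2 += stride;
        }
        return sum;
    }
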
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r--  libavcodec/aarch64/Makefile                |   2
-rw-r--r--  libavcodec/aarch64/me_cmp_init_aarch64.c   |  39
-rw-r--r--  libavcodec/aarch64/me_cmp_neon.S           | 205
3 files changed, 246 insertions, 0 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index c8935f205e..9ce21566c6 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
@@ -47,6 +48,7 @@ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
                               aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
new file mode 100644
index 0000000000..9fb63e9973
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideo.h"
+
+int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                      ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
+                          ptrdiff_t stride, int h);
+
+av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_abs[0][0] = ff_pix_abs16_neon;
+        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
+    }
+}
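
For context (not part of the patch): MECmpContext.pix_abs is indexed as pix_abs[size][mode], where size 0 selects 16-pixel-wide blocks (1 selects 8-wide) and mode 0..3 selects no, x, y or x+y half-pel interpolation, which is why checkasm reports these two entries as pix_abs_0_0 and pix_abs_0_3. The generic ff_me_cmp_init() in libavcodec/me_cmp.c calls this per-arch init under ARCH_AARCH64; that hook lies outside the path-filtered diff shown here. A hedged usage sketch, with the function name and the 16-row height chosen purely for illustration:

    #include "libavcodec/me_cmp.h"
    #include "libavcodec/mpegvideo.h"

    /* Illustrative only: how generic code consumes the pointers installed above. */
    static int sad16_example(MpegEncContext *s, AVCodecContext *avctx,
                             uint8_t *cur, uint8_t *ref, ptrdiff_t stride)
    {
        MECmpContext c = { 0 };
        ff_me_cmp_init(&c, avctx);  /* fills the tables; NEON entries when the CPU has NEON */
        int sad    = c.pix_abs[0][0](s, cur, ref, stride, 16);  /* plain 16x16 SAD */
        int sad_hv = c.pix_abs[0][3](s, cur, ref, stride, 16);  /* SAD vs. x+y half-pel reference */
        return sad < sad_hv ? sad : sad_hv;
    }
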
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
new file mode 100644
index 0000000000..a7937bd8be
--- /dev/null
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_abs16_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+ cmp w4, #4 // if h < 4, jump to completion section
+ movi v18.4S, #0 // clear result accumulator
+ b.lt 2f
+1:
+ ld1 {v0.16b}, [x1], x3 // load pix1
+ ld1 {v4.16b}, [x2], x3 // load pix2
+ ld1 {v1.16b}, [x1], x3 // load pix1
+ ld1 {v5.16b}, [x2], x3 // load pix2
+ uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabdl2 v17.8h, v0.16b, v4.16b
+ ld1 {v2.16b}, [x1], x3 // load pix1
+ ld1 {v6.16b}, [x2], x3 // load pix2
+ uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate
+ uabal2 v17.8h, v1.16b, v5.16b
+ ld1 {v3.16b}, [x1], x3
+ ld1 {v7.16b}, [x2], x3
+ uabal v16.8h, v2.8b, v6.8b
+ uabal2 v17.8h, v2.16b, v6.16b
+ sub w4, w4, #4 // h -= 4
+ uabal v16.8h, v3.8b, v7.8b
+ uabal2 v17.8h, v3.16b, v7.16b
+ cmp w4, #4 // if h >= 4, loop
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s16, v16.8h // add up everything in v16 accumulator
+ add d18, d16, d18 // add to the end result register
+
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain, jump to completion section
+
+ fmov w0, s18 // copy result to general purpose register
+ ret
+
+2:
+ ld1 {v0.16b}, [x1], x3 // load pix1
+ ld1 {v4.16b}, [x2], x3 // load pix2
+ uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate
+ uabal2 v16.8h, v0.16b, v4.16b
+ subs w4, w4, #1 // h -= 1
+ addv h16, v16.8h // add up v16
+ add d18, d16, d18 // add to result
+ b.ne 2b
+
+ fmov w0, s18 // copy result to general purpose register
+ ret
+endfunc
+
+function ff_pix_abs16_xy2_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *pix2
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ add x5, x2, x3 // use x5 to hold uint8_t *pix3
+ movi v0.2d, #0 // initialize the result register
+
+ // Load initial pix2 values for either the unrolled version or completion version.
+ ldur q4, [x2, #1] // load pix2+1
+ ldr q3, [x2] // load pix2
+ uaddl v2.8h, v4.8b, v3.8b // pix2 + pix2+1 0..7
+ uaddl2 v3.8h, v4.16b, v3.16b // pix2 + pix2+1 8..15
+ cmp w4, #4 // if h < 4 jump to the completion version
+ b.lt 2f
+1:
+ // This is an unrolled implementation. It completes 4 iterations of the C loop for each pass through this loop body.
+ // In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration,
+ // plus two at the beginning to start.
+ ldur q5, [x5, #1] // load pix3+1
+ ld1 {v4.16b}, [x5], x3 // load pix3
+ ld1 {v1.16b}, [x1], x3 // load pix1
+
+ ldur q7, [x5, #1] // load pix3+1
+ ld1 {v6.16b}, [x5], x3 // load pix3
+ ld1 {v16.16b}, [x1], x3 // load pix1
+
+ ldur q19, [x5, #1] // load pix3+1
+ ld1 {v18.16b}, [x5], x3 // load pix3
+ ld1 {v17.16b}, [x1], x3 // load pix1
+
+ ldur q22, [x5, #1] // load pix3+1
+ ld1 {v21.16b}, [x5], x3 // load pix3
+ ld1 {v20.16b}, [x1], x3 // load pix1
+
+ // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
+ uaddl v30.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
+ uaddl2 v31.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
+ add v23.8h, v2.8h, v30.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
+ add v24.8h, v3.8h, v31.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
+ rshrn v23.8b, v23.8h, #2 // shift right 2 0..7 (rounding shift right)
+ rshrn2 v23.16b, v24.8h, #2 // shift right 2 8..15
+
+ uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
+ uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
+ add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
+ rshrn2 v26.16b, v27.8h, #2 // shift right 2 8..15
+
+ uaddl v4.8h, v18.8b, v19.8b // pix3 + pix3+1 0..7
+ uaddl2 v5.8h, v18.16b, v19.16b // pix3 + pix3+1 8..15
+ add v28.8h, v2.8h, v4.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v29.8h, v3.8h, v5.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
+ rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
+
+ uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7
+ uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15
+ add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
+ add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
+ rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
+ rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
+
+ // Averages are now stored in these registers:
+ // v23, v26, v28, v30
+ // pix1 values in these registers:
+ // v1, v16, v17, v20
+ // available:
+ // v4, v5, v7, v18, v19, v24, v25, v27, v29, v31
+
+ sub w4, w4, #4 // h -= 4
+
+ // Using absolute-difference instructions instead of absolute-difference-accumulate allows
+ // us to keep the results in 16b vectors instead of widening values with twice the instructions.
+ // This approach also has fewer data dependencies, allowing better instruction level parallelism.
+ uabd v4.16b, v1.16b, v23.16b // absolute difference 0..15, i=0
+ uabd v5.16b, v16.16b, v26.16b // absolute difference 0..15, i=1
+ uabd v6.16b, v17.16b, v28.16b // absolute difference 0..15, i=2
+ uabd v7.16b, v20.16b, v30.16b // absolute difference 0..15, i=3
+
+ cmp w4, #4 // loop if h >= 4
+
+ // Now add up all the values in each vector, v4-v7 with widening adds
+ uaddl v19.8h, v4.8b, v5.8b
+ uaddl2 v18.8h, v4.16b, v5.16b
+ uaddl v4.8h, v6.8b, v7.8b
+ uaddl2 v5.8h, v6.16b, v7.16b
+ add v4.8h, v4.8h, v5.8h
+ add v4.8h, v4.8h, v18.8h
+ add v4.8h, v4.8h, v19.8h
+ uaddlv s4, v4.8h // finish adding up accumulated values
+ add d0, d0, d4 // add the value to the top level accumulator
+
+ b.ge 1b
+ cbnz w4, 2f // if iterations remain jump to completion section
+
+ fmov w0, s0 // copy result to general purpose register
+ ret
+2:
+ // v2 and v3 are set either at the end of this loop or by the unrolled version,
+ // which branches here to complete iterations when h % 4 != 0.
+ ldur q5, [x5, #1] // load pix3+1
+ ld1 {v4.16b}, [x5], x3 // load pix3
+ ld1 {v1.16b}, [x1], x3 // load pix1
+ subs w4, w4, #1 // decrement h
+
+ uaddl v18.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
+ uaddl2 v19.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
+ add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
+ add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
+ // divide by 4 to compute the average of values summed above
+ urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
+ urshr v17.8h, v17.8h, #2 // shift right by 2 8..15
+
+ uxtl2 v8.8h, v1.16b // 8->16 bits pix1 8..15
+ uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7
+
+ uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7
+ uaba v6.8h, v8.8h, v17.8h // absolute difference accumulate 8..15
+ mov v2.16b, v18.16b // pix3 -> pix2
+ mov v3.16b, v19.16b // pix3+1 -> pix2+1
+ uaddlv s6, v6.8h // add up accumulator in v6
+ add d0, d0, d6 // add to the final result
+
+ b.ne 2b // loop if h > 0
+ fmov w0, s0 // copy result to general purpose register
+ ret
+endfunc
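
To make the data reuse in the unrolled xy2 loop above easier to follow: each output row needs the horizontal sums pix2[x] + pix2[x+1] of two adjacent reference rows, and consecutive output rows share one of those sums, which is why v2/v3 carry the previous row's sums across iterations and only the new pix3 row is loaded and summed each time. A scalar sketch of the same dataflow and rounding, with illustrative names, not part of the patch:

    #include <stddef.h>  /* ptrdiff_t */
    #include <stdint.h>  /* uint8_t, uint16_t */
    #include <stdlib.h>  /* abs() */

    /* Illustrative scalar equivalent of ff_pix_abs16_xy2_neon (not part of this patch). */
    static int pix_abs16_xy2_sketch(const uint8_t *pix1, const uint8_t *pix2,
                                    ptrdiff_t stride, int h)
    {
        uint16_t hsum_prev[16];  /* plays the role of v2/v3 */
        int sum = 0;

        for (int x = 0; x < 16; x++)  /* horizontal sums of the first reference row */
            hsum_prev[x] = pix2[x] + pix2[x + 1];

        for (int y = 0; y < h; y++) {
            const uint8_t *pix3 = pix2 + (y + 1) * stride;  /* the row below */
            for (int x = 0; x < 16; x++) {
                uint16_t hsum_cur = pix3[x] + pix3[x + 1];
                /* rounded average of four neighbours, matching rshrn/urshr #2 above */
                int avg = (hsum_prev[x] + hsum_cur + 2) >> 2;
                sum += abs(pix1[y * stride + x] - avg);
                hsum_prev[x] = hsum_cur;  /* this row's sums feed the next output row */
            }
        }
        return sum;
    }

The rounding constant of 2 before the shift is what the "rounding shift right" comments in the assembly refer to.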