Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/arm')
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S | 239
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S | 24
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S | 320
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S | 39
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S | 39
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/asm.S | 6
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/cdef.h | 88
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h | 204
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/ipred.h | 80
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/itx.h | 141
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h | 45
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h | 265
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/mc.h | 114
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h (renamed from chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c) | 2
14 files changed, 1484 insertions(+), 122 deletions(-)
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S
index db8ecffe6ea..aa6c272e718 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S
@@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst
.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4s_x4 \r0, \r2, \r4, \r6
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
- vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
vrshr.s32 \r1, q2, #12 // t4a
- vrshr.s32 \r7, q4, #12 // t7a
+ vrshr.s32 \r7, q3, #12 // t7a
vrshr.s32 \r3, q6, #12 // t5a
vrshr.s32 \r5, q7, #12 // t6a
@@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst
vqadd.s32 q3, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
- vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5
+.irp r, q2, \r1, q3, \r3
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q2, \r1, q3, \r3
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
- vrshr.s32 q4, q4, #12 // t5
+ vrshr.s32 q7, q7, #12 // t5
vrshr.s32 q5, q6, #12 // t6
vqsub.s32 \r7, \r0, q3 // out7
vqadd.s32 \r0, \r0, q3 // out0
vqadd.s32 \r1, \r2, q5 // out1
vqsub.s32 q6, \r2, q5 // out6
- vqadd.s32 \r2, \r4, q4 // out2
- vqsub.s32 \r5, \r4, q4 // out5
+ vqadd.s32 \r2, \r4, q7 // out2
+ vqsub.s32 \r5, \r4, q7 // out5
vqadd.s32 \r3, \r6, q2 // out3
vqsub.s32 \r4, \r6, q2 // out4
vmov \r6, q6 // out6
@@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst
.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_2s_x4 \r0, \r2, \r4, \r6
+ vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
@@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst
vqadd.s32 d5, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
+.irp r, d4, \r1, d5, \r3
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, \r1, d5, \r3
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
vrshr.s32 d6, d6, #12 // t5
@@ -763,19 +795,28 @@ endfunc
vqadd.s32 q2, q8, q12 // t0
vqsub.s32 q3, q8, q12 // t4
+ vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vqadd.s32 q4, q15, q11 // t1
vqsub.s32 q5, q15, q11 // t5
vqadd.s32 q6, q10, q14 // t2
vqsub.s32 q7, q10, q14 // t6
+ vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 q10, q13, q9 // t3
vqsub.s32 q11, q13, q9 // t7
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmax.s32 \r, \r, q14
+.endr
+
vmul_vmla q8, q3, q5, d1[1], d1[0]
- vmul_vmls q12, q3, q5, d1[0], d1[1]
+ vmul_vmls q13, q3, q5, d1[0], d1[1]
vmul_vmls q14, q11, q7, d1[1], d1[0]
vrshr.s32 q3, q8, #12 // t4a
- vrshr.s32 q5, q12, #12 // t5a
+ vrshr.s32 q5, q13, #12 // t5a
vmul_vmla q8, q11, q7, d1[0], d1[1]
@@ -786,12 +827,24 @@ endfunc
vqsub.s32 q2, q2, q6 // t2
vqadd.s32 \r7, q4, q10 // out7
vqsub.s32 q4, q4, q10 // t3
- vqneg.s32 \r7, \r7 // out7
+
+ vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 \r1, q3, q7 // out1
vqsub.s32 q3, q3, q7 // t6
vqadd.s32 \r6, q5, q11 // out6
vqsub.s32 q5, q5, q11 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, q2, q4, q3, q5
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q4, q3, q5
+ vmax.s32 \r, \r, q10
+.endr
+
+ vqneg.s32 \r7, \r7 // out7
vqneg.s32 \r1, \r1 // out1
vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
@@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon
idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
+ // idct_8 leaves the row_clip_max/min constants in d9 and d8
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmax.s32 \r, \r, d8
+.endr
+
vld1.32 {q0, q1}, [r12, :128]
sub r12, r12, #32
@@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon
vqadd.s32 d25, d29, d27 // t12
vqsub.s32 d29, d29, d27 // t13
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
vrshr.s32 d21, d6, #12 // t9a
@@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon
vqsub.s32 d25, d27, d29 // t13
vqadd.s32 d27, d27, d29 // t14
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
@@ -1193,6 +1268,9 @@ endfunc
vld1.32 {q0, q1}, [r12, :128]
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
vqsub.s32 d5, d16, d23 // t8a
vqadd.s32 d16, d16, d23 // t0a
vqsub.s32 d7, d31, d24 // t9a
@@ -1210,6 +1288,13 @@ endfunc
vqadd.s32 d28, d25, d30 // t7a
vqsub.s32 d25, d25, d30 // t15a
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
@@ -1244,6 +1329,13 @@ endfunc
vqadd.s32 d20, d29, d22 // t11a
vqsub.s32 d29, d29, d22 // t15a
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
@@ -1272,24 +1364,34 @@ endfunc
vqadd.s32 \o15,d31, d26 // out15
vmov \o0, d4
.endif
- vqneg.s32 \o15, \o15 // out15
vqsub.s32 d3, d29, d18 // t15a
vqadd.s32 \o13,d29, d18 // out13
vqadd.s32 \o2, d17, d30 // out2
vqsub.s32 d26, d17, d30 // t14a
- vqneg.s32 \o13,\o13 // out13
vqadd.s32 \o1, d19, d27 // out1
vqsub.s32 d27, d19, d27 // t10
vqadd.s32 \o14,d28, d20 // out14
vqsub.s32 d20, d28, d20 // t11
- vqneg.s32 \o1, \o1 // out1
vqadd.s32 \o3, d22, d24 // out3
vqsub.s32 d22, d22, d24 // t6
vqadd.s32 \o12,d25, d23 // out12
vqsub.s32 d23, d25, d23 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmax.s32 \r, \r, d10
+.endr
+
+ vqneg.s32 \o15, \o15 // out15
+ vqneg.s32 \o13,\o13 // out13
+ vqneg.s32 \o1, \o1 // out1
vqneg.s32 \o3, \o3 // out3
vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
@@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon
vld1.32 {q0, q1}, [r12, :128]
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
vqsub.s32 d5, d16, d24 // t17
vqadd.s32 d16, d16, d24 // t16
vqsub.s32 d7, d31, d23 // t30
@@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon
vqadd.s32 d25, d19, d27 // t28
vqsub.s32 d19, d19, d27 // t29
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
@@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d29, d31, d25 // t28a
vqadd.s32 d31, d31, d25 // t31a
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
@@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d24, d24, d19 // t27a
vmov d19, d4 // out19
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
vrshr.s32 d20, d4, #12 // t20
@@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon
scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
.endif
bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
vtrn.32 d16, d17
vtrn.32 d18, d19
vtrn.32 d20, d21
@@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon
vqsub.s32 d30, d23, d22 // t62
vqadd.s32 d31, d23, d22 // t63
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
vneg.s32 d4, d4 // t34a
- vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a
+ vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
vrshr.s32 d26, d4, #12 // t34a
vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
vrshr.s32 d29, d6, #12 // t61a
- vrshr.s32 d25, d8, #12 // t33a
+ vrshr.s32 d25, d7, #12 // t33a
vrshr.s32 d30, d4, #12 // t62a
vqadd.s32 d16, d24, d27 // t32a
@@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon
vqsub.s32 d21, d30, d29 // t61
vqadd.s32 d22, d30, d29 // t62
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
- vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60
+ vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
vrshr.s32 d21, d4, #12 // t61a
vrshr.s32 d18, d6, #12 // t34a
vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
- vrshr.s32 d20, d8, #12 // t60
+ vrshr.s32 d20, d7, #12 // t60
vrshr.s32 d19, d4, #12 // t35
vst1.32 {d16, d17, d18, d19}, [r6, :128]!
@@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon
vqadd.s32 d30, d23, d22 // t48
vqsub.s32 d31, d23, d22 // t55
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
- vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a
+ vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
vrshr.s32 d25, d4, #12 // t56a
vrshr.s32 d27, d6, #12 // t39a
- vneg.s32 d8, d8 // t40a
+ vneg.s32 d7, d7 // t40a
vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
- vrshr.s32 d31, d8, #12 // t40a
+ vrshr.s32 d31, d7, #12 // t40a
vrshr.s32 d28, d4, #12 // t55a
vqadd.s32 d16, d24, d29 // t32a
@@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon
vqsub.s32 d21, d25, d28 // t55
vqadd.s32 d22, d25, d28 // t56
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
- vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47
+ vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
vrshr.s32 d18, d4, #12 // t40a
vrshr.s32 d21, d6, #12 // t55a
vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
- vrshr.s32 d19, d8, #12 // t47
+ vrshr.s32 d19, d7, #12 // t47
vrshr.s32 d20, d4, #12 // t48
vstr d16, [r6, #4*2*0] // t32a
@@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
bl inv_dct_2s_x16_neon
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
store16 r6
movdup_if d0, r12, 2896*8*(1<<16), \scale
@@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
mov r9, #-8
+ vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.macro store_addsub r0, r1, r2, r3
vld1.32 {d2}, [r6, :64]!
vld1.32 {d3}, [r6, :64]!
@@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
vld1.32 {d4}, [r6, :64]!
vqadd.s32 d7, d3, \r1
vqsub.s32 \r1, d3, \r1
+ vmin.s32 d6, d6, d1
+ vmin.s32 \r0, \r0, d1
vld1.32 {d5}, [r6, :64]!
vqadd.s32 d2, d4, \r2
sub r6, r6, #8*4
+ vmax.s32 d6, d6, d0
+ vmax.s32 \r0, \r0, d0
vqsub.s32 \r2, d4, \r2
+ vmin.s32 d7, d7, d1
+ vmin.s32 \r1, \r1, d1
vst1.32 {d6}, [r6, :64]!
vst1.32 {\r0}, [r10, :64], r9
+ vmin.s32 d2, d2, d1
+ vmin.s32 \r2, \r2, d1
+ vmax.s32 d7, d7, d0
+ vmax.s32 \r1, \r1, d0
vqadd.s32 d3, d5, \r3
vqsub.s32 \r3, d5, \r3
+ vmax.s32 d2, d2, d0
+ vmax.s32 \r2, \r2, d0
+ vmin.s32 d3, d3, d1
+ vmin.s32 \r3, \r3, d1
vst1.32 {d7}, [r6, :64]!
vst1.32 {\r1}, [r10, :64], r9
+ vmax.s32 d3, d3, d0
+ vmax.s32 \r3, \r3, d0
vst1.32 {d2}, [r6, :64]!
vst1.32 {\r2}, [r10, :64], r9
vst1.32 {d3}, [r6, :64]!
@@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
add r6, r6, #2*4*16
movrel_local r12, idct64_coeffs
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
movdup_if d0, lr, 2896*8*(1<<16), \scale
vmov_if d7, #0, \clear
add r9, r7, r8, lsl #4 // offset 16
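[Note on the file above: the added vmin.s32/vmax.s32 pairs clamp intermediate coefficients between transform stages. As a minimal C sketch (illustration only, not dav1d's code), the row_clip_max/min constants from the comments can be derived as follows; bdmax = 0x3ff, the 10-bit pixel maximum, reproduces the hardcoded 0x1ffff / 0xfffe0000 pair.]

#include <stdint.h>
#include <stdio.h>

/* Derive the row clip bounds as in the asm comments:
 * row_clip_max = ~(~bdmax << 7), row_clip_min = (~bdmax << 7).
 * Computed through uint32_t to keep the left shift well-defined. */
static void row_clip_bounds(uint32_t bdmax, int32_t *lo, int32_t *hi) {
    const uint32_t nb = ~bdmax << 7;
    *lo = (int32_t)nb;   /* 0xfffe0000 == -131072 for bdmax 0x3ff */
    *hi = (int32_t)~nb;  /* 0x0001ffff ==  131071 for bdmax 0x3ff */
}

int main(void) {
    int32_t lo, hi;
    row_clip_bounds(0x3ff, &lo, &hi);
    printf("%#x %#x\n", (uint32_t)hi, (uint32_t)lo); /* 0x1ffff 0xfffe0000 */
    return 0;
}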
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
index c9650e9d544..b1b2f8fe659 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
@@ -483,10 +483,10 @@ endfunc
add \o1\().4s, v5.4s, v7.4s
sub \o3\().4s, \o3\().4s, v7.4s
- rshrn \o0\().4h, \o0\().4s, #12
- rshrn \o2\().4h, \o2\().4s, #12
- rshrn \o1\().4h, \o1\().4s, #12
- rshrn \o3\().4h, \o3\().4s, #12
+ sqrshrn \o0\().4h, \o0\().4s, #12
+ sqrshrn \o2\().4h, \o2\().4s, #12
+ sqrshrn \o1\().4h, \o1\().4s, #12
+ sqrshrn \o3\().4h, \o3\().4s, #12
.endm
function inv_adst_4h_x4_neon, export=1
@@ -538,21 +538,21 @@ endfunc
sub v4.4s, v4.4s, v2.4s // out3
sub v5.4s, v5.4s, v3.4s
- rshrn v18.4h, v18.4s, #12
- rshrn2 v18.8h, v19.4s, #12
+ sqrshrn v18.4h, v18.4s, #12
+ sqrshrn2 v18.8h, v19.4s, #12
- rshrn \o0\().4h, v16.4s, #12
- rshrn2 \o0\().8h, v17.4s, #12
+ sqrshrn \o0\().4h, v16.4s, #12
+ sqrshrn2 \o0\().8h, v17.4s, #12
.ifc \o2, v17
mov v17.16b, v18.16b
.endif
- rshrn \o1\().4h, v6.4s, #12
- rshrn2 \o1\().8h, v7.4s, #12
+ sqrshrn \o1\().4h, v6.4s, #12
+ sqrshrn2 \o1\().8h, v7.4s, #12
- rshrn \o3\().4h, v4.4s, #12
- rshrn2 \o3\().8h, v5.4s, #12
+ sqrshrn \o3\().4h, v4.4s, #12
+ sqrshrn2 \o3\().8h, v5.4s, #12
.endm
function inv_adst_8h_x4_neon, export=1
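[Note on the file above: the rshrn -> sqrshrn changes swap a plain rounding shift-right-narrow for the saturating form. A hedged scalar model of the per-lane difference at shift #12:]

#include <stdint.h>

/* Plain rounding narrow: rounds, then keeps the low 16 bits (can wrap). */
static int16_t rshrn12(int32_t v) {
    return (int16_t)((v + (1 << 11)) >> 12);
}

/* Saturating rounding narrow: rounds, then clamps into int16_t range. */
static int16_t sqrshrn12(int32_t v) {
    const int32_t r = (int32_t)(((int64_t)v + (1 << 11)) >> 12);
    if (r > INT16_MAX) return INT16_MAX;
    if (r < INT16_MIN) return INT16_MIN;
    return (int16_t)r;
}
/* e.g. v = 0x10000000: rshrn12 wraps to 0, sqrshrn12 saturates to 32767. */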
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
index 0a0c7768b13..eee3a9636de 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
@@ -124,6 +124,13 @@ endconst
.endif
.endm
+.macro smin_4s r0, r1, r2
+ smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2
+ smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
@@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4 \r0, \r2, \r4, \r6
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
- mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
srshr \r1\().4s, v2.4s, #12 // t4a
- srshr \r7\().4s, v4.4s, #12 // t7a
+ srshr \r7\().4s, v3.4s, #12 // t7a
srshr \r3\().4s, v6.4s, #12 // t5a
srshr \r5\().4s, v7.4s, #12 // t6a
@@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst
sqadd v3.4s, \r7\().4s, \r5\().4s // t7
sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
- mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+.irp r, v2, \r1, v3, \r3
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
- srshr v4.4s, v4.4s, #12 // t5
- srshr v5.4s, v6.4s, #12 // t6
+ srshr v7.4s, v7.4s, #12 // t5
+ srshr v6.4s, v6.4s, #12 // t6
sqsub \r7\().4s, \r0\().4s, v3.4s // out7
sqadd \r0\().4s, \r0\().4s, v3.4s // out0
- sqadd \r1\().4s, \r2\().4s, v5.4s // out1
- sqsub v6.4s, \r2\().4s, v5.4s // out6
- sqadd \r2\().4s, \r4\().4s, v4.4s // out2
- sqsub \r5\().4s, \r4\().4s, v4.4s // out5
+ sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+ sqsub v6.4s, \r2\().4s, v6.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v7.4s // out5
sqadd \r3\().4s, \r6\().4s, v2.4s // out3
sqsub \r4\().4s, \r6\().4s, v2.4s // out4
mov \r6\().16b, v6.16b // out6
@@ -660,8 +683,11 @@ endfunc
ld1 {v0.4s}, [x16]
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
sqadd v2.4s, v16.4s, v20.4s // t0
sqsub v3.4s, v16.4s, v20.4s // t4
+ mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd v4.4s, v23.4s, v19.4s // t1
sqsub v5.4s, v23.4s, v19.4s // t5
sqadd v6.4s, v18.4s, v22.4s // t2
@@ -669,6 +695,13 @@ endfunc
sqadd v18.4s, v21.4s, v17.4s // t3
sqsub v19.4s, v21.4s, v17.4s // t7
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smax_4s \r, \r, v20
+.endr
+
mul_mla v16, v3, v5, v0.s[3], v0.s[2]
mul_mls v20, v3, v5, v0.s[2], v0.s[3]
mul_mls v22, v19, v7, v0.s[3], v0.s[2]
@@ -685,12 +718,24 @@ endfunc
sqsub v2.4s, v2.4s, v6.4s // t2
sqadd \o7\().4s, v4.4s, v18.4s // out7
sqsub v4.4s, v4.4s, v18.4s // t3
- sqneg \o7\().4s, \o7\().4s // out7
+
+ mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd \o1\().4s, v3.4s, v7.4s // out1
sqsub v3.4s, v3.4s, v7.4s // t6
sqadd \o6\().4s, v5.4s, v19.4s // out6
sqsub v5.4s, v5.4s, v19.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v4, v3, v5
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+ smax_4s \r, \r, v18
+.endr
+
+ sqneg \o7\().4s, \o7\().4s // out7
sqneg \o1\().4s, \o1\().4s // out1
mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
@@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon
idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+ // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
ld1 {v0.4s, v1.4s}, [x16]
sub x16, x16, #32
mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
- mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
srshr v17.4s, v2.4s, #12 // t8a
- srshr v31.4s, v4.4s, #12 // t15a
+ srshr v31.4s, v3.4s, #12 // t15a
mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
- mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
srshr v23.4s, v6.4s, #12 // t9a
srshr v25.4s, v2.4s, #12 // t14a
mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
- srshr v21.4s, v4.4s, #12 // t10a
+ srshr v21.4s, v3.4s, #12 // t10a
srshr v27.4s, v6.4s, #12 // t13a
- mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
srshr v19.4s, v2.4s, #12 // t11a
- srshr v29.4s, v4.4s, #12 // t12a
+ srshr v29.4s, v3.4s, #12 // t12a
ld1 {v0.4s}, [x16]
@@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon
sqadd v25.4s, v29.4s, v27.4s // t12
sqsub v29.4s, v29.4s, v27.4s // t13
- mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
- srshr v21.4s, v4.4s, #12 // t9a
+ srshr v21.4s, v7.4s, #12 // t9a
srshr v27.4s, v6.4s, #12 // t14a
- mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
- srshr v29.4s, v4.4s, #12 // t13a
+ srshr v29.4s, v7.4s, #12 // t13a
neg v6.4s, v6.4s
srshr v23.4s, v6.4s, #12 // t10a
@@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon
sqsub v25.4s, v27.4s, v29.4s // t13
sqadd v27.4s, v27.4s, v29.4s // t14
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
- srshr v4.4s, v4.4s, #12 // t11
- srshr v5.4s, v6.4s, #12 // t12
- mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v7.4s, v7.4s, #12 // t11
+ srshr v6.4s, v6.4s, #12 // t12
+ mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v2.4s, v2.4s, #12 // t10a
- srshr v3.4s, v6.4s, #12 // t13a
+ srshr v3.4s, v3.4s, #12 // t13a
- sqadd v6.4s, v16.4s, v31.4s // out0
+ sqadd v1.4s, v16.4s, v31.4s // out0
sqsub v31.4s, v16.4s, v31.4s // out15
- mov v16.16b, v6.16b
+ mov v16.16b, v1.16b
sqadd v23.4s, v30.4s, v17.4s // out7
- sqsub v7.4s, v30.4s, v17.4s // out8
+ sqsub v1.4s, v30.4s, v17.4s // out8
sqadd v17.4s, v18.4s, v27.4s // out1
sqsub v30.4s, v18.4s, v27.4s // out14
sqadd v18.4s, v20.4s, v3.4s // out2
sqsub v29.4s, v20.4s, v3.4s // out13
sqadd v3.4s, v28.4s, v19.4s // out6
sqsub v25.4s, v28.4s, v19.4s // out9
- sqadd v19.4s, v22.4s, v5.4s // out3
- sqsub v28.4s, v22.4s, v5.4s // out12
- sqadd v20.4s, v24.4s, v4.4s // out4
- sqsub v27.4s, v24.4s, v4.4s // out11
+ sqadd v19.4s, v22.4s, v6.4s // out3
+ sqsub v28.4s, v22.4s, v6.4s // out12
+ sqadd v20.4s, v24.4s, v7.4s // out4
+ sqsub v27.4s, v24.4s, v7.4s // out11
sqadd v21.4s, v26.4s, v2.4s // out5
sqsub v26.4s, v26.4s, v2.4s // out10
- mov v24.16b, v7.16b
+ mov v24.16b, v1.16b
mov v22.16b, v3.16b
ret
@@ -1084,6 +1151,9 @@ endfunc
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v23.4s // t8a
sqadd v16.4s, v16.4s, v23.4s // t0a
sqsub v3.4s, v31.4s, v24.4s // t9a
@@ -1101,6 +1171,13 @@ endfunc
sqadd v28.4s, v25.4s, v30.4s // t7a
sqsub v25.4s, v25.4s, v30.4s // t15a
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
@@ -1135,6 +1212,13 @@ endfunc
sqadd v20.4s, v29.4s, v22.4s // t11a
sqsub v29.4s, v29.4s, v22.4s // t15a
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
@@ -1163,24 +1247,34 @@ endfunc
sqadd \o15\().4s, v31.4s, v26.4s // out15
mov \o0\().16b, v4.16b
.endif
- sqneg \o15\().4s, \o15\().4s // out15
sqsub v3.4s, v29.4s, v18.4s // t15a
sqadd \o13\().4s, v29.4s, v18.4s // out13
sqadd \o2\().4s, v17.4s, v30.4s // out2
sqsub v26.4s, v17.4s, v30.4s // t14a
- sqneg \o13\().4s, \o13\().4s // out13
sqadd \o1\().4s, v19.4s, v27.4s // out1
sqsub v27.4s, v19.4s, v27.4s // t10
sqadd \o14\().4s, v28.4s, v20.4s // out14
sqsub v20.4s, v28.4s, v20.4s // t11
- sqneg \o1\().4s, \o1\().4s // out1
sqadd \o3\().4s, v22.4s, v24.4s // out3
sqsub v22.4s, v22.4s, v24.4s // t6
sqadd \o12\().4s, v25.4s, v23.4s // out12
sqsub v23.4s, v25.4s, v23.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smax_4s \r, \r, v7
+.endr
+
+ sqneg \o15\().4s, \o15\().4s // out15
+ sqneg \o13\().4s, \o13\().4s // out13
+ sqneg \o1\().4s, \o1\().4s // out1
sqneg \o3\().4s, \o3\().4s // out3
mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
@@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v24.4s // t17
sqadd v16.4s, v16.4s, v24.4s // t16
sqsub v3.4s, v31.4s, v23.4s // t30
@@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon
sqadd v25.4s, v19.4s, v27.4s // t28
sqsub v19.4s, v19.4s, v27.4s // t29
- mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
- srshr v21.4s, v4.4s, #12 // t17a
+ srshr v21.4s, v7.4s, #12 // t17a
srshr v27.4s, v6.4s, #12 // t30a
neg v2.4s, v2.4s // -> t18a
- mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
srshr v19.4s, v2.4s, #12 // t18a
- srshr v24.4s, v4.4s, #12 // t29a
+ srshr v24.4s, v7.4s, #12 // t29a
mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
- mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
srshr v22.4s, v6.4s, #12 // t21a
srshr v18.4s, v2.4s, #12 // t26a
- neg v4.4s, v4.4s // -> t22a
+ neg v7.4s, v7.4s // -> t22a
mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
- srshr v17.4s, v4.4s, #12 // t22a
+ srshr v17.4s, v7.4s, #12 // t22a
srshr v20.4s, v6.4s, #12 // t25a
sqsub v2.4s, v27.4s, v24.4s // t29
@@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon
sqsub v29.4s, v31.4s, v25.4s // t28a
sqadd v31.4s, v31.4s, v25.4s // t31a
- mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
- srshr v18.4s, v4.4s, #12 // t18a
+ srshr v18.4s, v7.4s, #12 // t18a
srshr v25.4s, v6.4s, #12 // t29a
- mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
srshr v29.4s, v2.4s, #12 // t19
- srshr v24.4s, v4.4s, #12 // t28
+ srshr v24.4s, v7.4s, #12 // t28
neg v6.4s, v6.4s // -> t20
mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
- mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
srshr v26.4s, v6.4s, #12 // t20
srshr v19.4s, v2.4s, #12 // t27
- neg v4.4s, v4.4s // -> t21a
+ neg v7.4s, v7.4s // -> t21a
mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
- srshr v20.4s, v4.4s, #12 // t21a
+ srshr v20.4s, v7.4s, #12 // t21a
srshr v28.4s, v6.4s, #12 // t26a
sqsub v2.4s, v16.4s, v30.4s // t23
@@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon
sqsub v21.4s, v27.4s, v22.4s // t25a
sqsub v27.4s, v18.4s, v20.4s // t21
sqadd v18.4s, v18.4s, v20.4s // t18 = out18
- sqadd v4.4s, v29.4s, v26.4s // t19a = out19
+ sqadd v7.4s, v29.4s, v26.4s // t19a = out19
sqsub v26.4s, v29.4s, v26.4s // t20a
sqadd v29.4s, v25.4s, v28.4s // t29 = out29
sqsub v25.4s, v25.4s, v28.4s // t26
sqadd v28.4s, v24.4s, v19.4s // t28a = out28
sqsub v24.4s, v24.4s, v19.4s // t27a
- mov v19.16b, v4.16b // out19
+ mov v19.16b, v7.16b // out19
- mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
- srshr v20.4s, v4.4s, #12 // t20
+ srshr v20.4s, v7.4s, #12 // t20
srshr v22.4s, v6.4s, #12 // t27
- mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
mov v27.16b, v22.16b // t27
- srshr v26.4s, v4.4s, #12 // t26a
+ srshr v26.4s, v7.4s, #12 // t26a
mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
- mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
+ mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
srshr v21.4s, v6.4s, #12 // t21a
srshr v22.4s, v24.4s, #12 // t22
- srshr v25.4s, v4.4s, #12 // t25
+ srshr v25.4s, v7.4s, #12 // t25
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
- srshr v23.4s, v4.4s, #12 // t23a
+ srshr v23.4s, v7.4s, #12 // t23a
srshr v24.4s, v6.4s, #12 // t24a
ret
@@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
@@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon
sqsub v30.4s, v23.4s, v22.4s // t62
sqadd v31.4s, v23.4s, v22.4s // t63
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
- mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
neg v2.4s, v2.4s // t34a
mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
srshr v26.4s, v2.4s, #12 // t34a
mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
- srshr v29.4s, v4.4s, #12 // t61a
+ srshr v29.4s, v7.4s, #12 // t61a
srshr v25.4s, v6.4s, #12 // t33a
srshr v30.4s, v2.4s, #12 // t62a
@@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon
sqsub v21.4s, v30.4s, v29.4s // t61
sqadd v22.4s, v30.4s, v29.4s // t62
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
- mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
srshr v21.4s, v2.4s, #12 // t61a
- srshr v18.4s, v4.4s, #12 // t34a
+ srshr v18.4s, v7.4s, #12 // t34a
mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
srshr v20.4s, v6.4s, #12 // t60
srshr v19.4s, v2.4s, #12 // t35
@@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon
sqadd v30.4s, v23.4s, v22.4s // t48
sqsub v31.4s, v23.4s, v22.4s // t55
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
- mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
srshr v25.4s, v2.4s, #12 // t56a
- srshr v27.4s, v4.4s, #12 // t39a
+ srshr v27.4s, v7.4s, #12 // t39a
neg v6.4s, v6.4s // t40a
mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
srshr v31.4s, v6.4s, #12 // t40a
@@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon
sqsub v21.4s, v25.4s, v28.4s // t55
sqadd v22.4s, v25.4s, v28.4s // t56
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
- mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
srshr v18.4s, v2.4s, #12 // t40a
- srshr v21.4s, v4.4s, #12 // t55a
+ srshr v21.4s, v7.4s, #12 // t55a
mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
srshr v19.4s, v6.4s, #12 // t47
srshr v20.4s, v2.4s, #12 // t48
@@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
bl inv_dct_4s_x16_neon
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
store16 x6
movz16dup_if v0.2s, w16, #2896*8, \scale
@@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
mov x9, #-16
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
.macro store_addsub r0, r1, r2, r3
ld1 {v2.4s}, [x6], #16
ld1 {v3.4s}, [x6], #16
@@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
ld1 {v4.4s}, [x6], #16
sqadd v7.4s, v3.4s, \r1
sqsub \r1, v3.4s, \r1
+ smin v6.4s, v6.4s, v1.4s
+ smin \r0, \r0, v1.4s
ld1 {v5.4s}, [x6], #16
sqadd v2.4s, v4.4s, \r2
sub x6, x6, #16*4
+ smax v6.4s, v6.4s, v0.4s
+ smax \r0, \r0, v0.4s
sqsub \r2, v4.4s, \r2
+ smin v7.4s, v7.4s, v1.4s
+ smin \r1, \r1, v1.4s
st1 {v6.4s}, [x6], #16
st1 {\r0}, [x10], x9
+ smin v2.4s, v2.4s, v1.4s
+ smin \r2, \r2, v1.4s
+ smax v7.4s, v7.4s, v0.4s
+ smax \r1, \r1, v0.4s
sqadd v3.4s, v5.4s, \r3
sqsub \r3, v5.4s, \r3
+ smax v2.4s, v2.4s, v0.4s
+ smax \r2, \r2, v0.4s
+ smin v3.4s, v3.4s, v1.4s
+ smin \r3, \r3, v1.4s
st1 {v7.4s}, [x6], #16
st1 {\r1}, [x10], x9
+ smax v3.4s, v3.4s, v0.4s
+ smax \r3, \r3, v0.4s
st1 {v2.4s}, [x6], #16
st1 {\r2}, [x10], x9
st1 {v3.4s}, [x6], #16
@@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
add x6, x6, #4*4*16
movrel x17, idct64_coeffs
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
movz16dup_if v0.2s, w16, #2896*8, \scale
movi_if v7.4s, #0, \clear
add x9, x7, x8, lsl #4 // offset 16
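[Encoding note on the AArch64 clip constants above: MOVI/MVNI with the MSL (masked shift left) modifier shift the 8-bit immediate left and fill the vacated low bits with ones. A sketch of the resulting values, offered as my reading of the encoding:]

#include <stdint.h>

/* movi v5.4s, #1, msl #16  ->  (1 << 16) | 0xffff  ==  0x0001ffff */
static const uint32_t row_clip_max = (1u << 16) | 0xffffu;
/* mvni v4.4s, #1, msl #16  ->  bitwise NOT of the above  ==  0xfffe0000 */
static const uint32_t row_clip_min = ~((1u << 16) | 0xffffu);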
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S
index 2b9b5c408ec..63d5de10ada 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S
@@ -28,6 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
.macro loop_filter wd
function lpf_16_wd\wd\()_neon
uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
@@ -77,8 +82,10 @@ function lpf_16_wd\wd\()_neon
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
- b.eq 9f // if (!fm || wd < 4) return;
-
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
.if \wd >= 6
movi v10.16b, #1
uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
@@ -474,20 +481,20 @@ function lpf_16_wd\wd\()_neon
bif v11.16b, v29.16b, v15.16b // out q5
.endif
+ mov x14, #0
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
- ret x13
+ mov x14, #(1 << 6)
+ ret
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
- ret x14
+ mov x14, #(1 << 4)
+ ret
.endif
-9:
- // Return directly without writing back any pixels
- ret x15
endfunc
.endm
@@ -497,22 +504,34 @@ loop_filter 6
loop_filter 4
.macro lpf_16_wd16
- adr x13, 7f
- adr x14, 8f
bl lpf_16_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_16_wd8
- adr x14, 8f
bl lpf_16_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_16_wd6
bl lpf_16_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
.macro lpf_16_wd4
bl lpf_16_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
function lpf_v_4_16_neon
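[Note on the file above: the refactor drops computed return addresses (adr x13/x14 followed by ret x13/x14 into numbered labels) in favour of a status value in x14 that each caller macro dispatches on with cbz/tbnz; returning only through the real link register keeps the code compatible with signed return addresses (see the asm.S change below). A rough, runnable C model of the dispatch, matching the comment block added at the top of the file:]

#include <stdio.h>

enum { STORE_NONE = 1 << 0, STORE_INNER4 = 1 << 4, STORE_INNER6 = 1 << 6 };

/* x14 is the value lpf_16_wdN_neon now returns (per the comment block). */
static const char *dispatch(unsigned x14) {
    if (x14 == 0)           return "store all pixels";     /* cbz x14, 1f  */
    if (x14 & STORE_INNER6) return "store inner 6 pixels"; /* tbnz x14, #6 */
    if (x14 & STORE_INNER4) return "store inner 4 pixels"; /* tbnz x14, #4 */
    return "store nothing";                                /* ret x15      */
}

int main(void) {
    printf("%s\n", dispatch(0));          /* store all pixels */
    printf("%s\n", dispatch(STORE_NONE)); /* store nothing    */
    return 0;
}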
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S
index aab0230c44b..d181a3e6239 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S
@@ -28,6 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
@@ -77,8 +82,10 @@ function lpf_8_wd\wd\()_neon
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
- b.eq 9f // if (!fm || wd < 4) return;
-
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
.if \wd >= 6
movi v10.8h, #1
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
@@ -360,20 +367,20 @@ function lpf_8_wd\wd\()_neon
bif v11.16b, v29.16b, v15.16b // out q5
.endif
+ mov x14, #0
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
- ret x13
+ mov x14, #(1 << 6)
+ ret
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
- ret x14
+ mov x14, #(1 << 4)
+ ret
.endif
-9:
- // Return directly without writing back any pixels
- ret x15
endfunc
.endm
@@ -383,22 +390,34 @@ loop_filter 6
loop_filter 4
.macro lpf_8_wd16
- adr x13, 7f
- adr x14, 8f
bl lpf_8_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_8_wd8
- adr x14, 8f
bl lpf_8_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_8_wd6
bl lpf_8_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
.macro lpf_8_wd4
bl lpf_8_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
function lpf_v_4_8_neon
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
index d1083c6b561..dc50415f1f1 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
@@ -135,6 +135,12 @@
#endif
#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+#elif defined(__APPLE__) && defined(__arm64e__)
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+
#else /* __ARM_FEATURE_PAC_DEFAULT */
#define GNU_PROPERTY_AARCH64_PAC 0
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h
new file mode 100644
index 00000000000..2e8c8ab6fb8
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
+
+void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+
+// Passing edges to this function, to allow it to switch to a more
+// optimized version for fully edged cases. Using size_t for edges,
+// to avoid ABI differences for passing more than one argument on the stack.
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+
+#define DEFINE_FILTER(w, h, tmp_stride) \
+static void \
+cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, const int sec_strength, \
+ const int dir, const int damping, \
+ const enum CdefEdgeFlags edges \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
+ BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \
+ left, top, bottom, h, edges); \
+ BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
+ sec_strength, dir, damping, h, edges \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+DEFINE_FILTER(8, 8, 16)
+DEFINE_FILTER(4, 8, 8)
+DEFINE_FILTER(4, 4, 8)
+
+static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->dir = BF(dav1d_cdef_find_dir, neon);
+ c->fb[0] = cdef_filter_8x8_neon;
+ c->fb[1] = cdef_filter_4x8_neon;
+ c->fb[2] = cdef_filter_4x4_neon;
+}
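[Note on the temporary buffer sized in DEFINE_FILTER above; the geometry below is my reading of the code, stated as an assumption rather than taken from a source comment:]

#include <stdint.h>

enum { TMP_STRIDE = 16 }; /* the 8x8 instantiation: DEFINE_FILTER(8, 8, 16) */

/* 12 * stride appears to cover 2 top padding rows + 8 data rows + 2 bottom
 * padding rows; the extra +8 leaves room for the left padding of the first
 * row, since tmp points 8 entries past the start of its row. */
static uint16_t tmp_buf[12 * TMP_STRIDE + 8];
static uint16_t *const tmp = tmp_buf + 2 * TMP_STRIDE + 8;
/* tmp[0] is pixel (0,0); tmp[-8] is its leftmost padding entry. */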
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h
new file mode 100644
index 00000000000..48776ac8524
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+#include "asm-offsets.h"
+
+CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT);
+
+CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET);
+CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE);
+
+void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX);
+
+#define GEN_GRAIN_UV(suff) \
+void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, \
+ const intptr_t uv \
+ HIGHBD_DECL_SUFFIX)
+
+GEN_GRAIN_UV(420);
+GEN_GRAIN_UV(422);
+GEN_GRAIN_UV(444);
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// same layout of parameters on the stack across platforms.
+void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
+ const pixel *const src,
+ const ptrdiff_t stride,
+ const uint8_t scaling[SCALING_SIZE],
+ const int scaling_shift,
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[][2],
+ const int h, const ptrdiff_t clip,
+ const ptrdiff_t type
+ HIGHBD_DECL_SUFFIX);
+
+static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride,
+ const Dav1dFilmGrainData *const data, const size_t pw,
+ const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in BLOCK_SIZE^2 blocks
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ int type = 0;
+ if (data->overlap_flag && row_num)
+ type |= 1; /* overlap y */
+ if (data->overlap_flag && bx)
+ type |= 2; /* overlap x */
+
+ BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride,
+ scaling, data->scaling_shift,
+ grain_lut, offsets, bh,
+ data->clip_to_restricted_range, type
+ HIGHBD_TAIL_SUFFIX);
+ }
+}
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// parameters on the stack with the same layout across platforms.
+#define FGUV(nm, sx, sy) \
+void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
+ const pixel *const src, \
+ const ptrdiff_t stride, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const Dav1dFilmGrainData *const data, \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, \
+ const int offsets[][2], \
+ const ptrdiff_t h, const ptrdiff_t uv, \
+ const ptrdiff_t is_id, \
+ const ptrdiff_t type \
+ HIGHBD_DECL_SUFFIX); \
+static void \
+fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
+ const size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], const int bh, \
+ const int row_num, const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, const int uv, const int is_id \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ const int rows = 1 + (data->overlap_flag && row_num > 0); \
+ \
+ /* seed[0] contains the current row, seed[1] contains the previous */ \
+ unsigned seed[2]; \
+ for (int i = 0; i < rows; i++) { \
+ seed[i] = data->seed; \
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \
+ } \
+ \
+ int offsets[2 /* col offset */][2 /* row offset */]; \
+ \
+ /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
+ if (data->overlap_flag && bx) { \
+ /* shift previous offsets left */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[1][i] = offsets[0][i]; \
+ } \
+ \
+ /* update current offsets */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[0][i] = get_random_number(8, &seed[i]); \
+ \
+ int type = 0; \
+ if (data->overlap_flag && row_num) \
+ type |= 1; /* overlap y */ \
+ if (data->overlap_flag && bx) \
+ type |= 2; /* overlap x */ \
+ if (data->chroma_scaling_from_luma) \
+ type |= 4; \
+ \
+ BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \
+ scaling, data, grain_lut, \
+ luma_row + (bx << sx), luma_stride, \
+ offsets, bh, uv, is_id, type \
+ HIGHBD_TAIL_SUFFIX); \
+ } \
+}
+
+FGUV(420, 1, 1);
+FGUV(422, 1, 0);
+FGUV(444, 0, 0);
+
+static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon);
+
+ c->fgy_32x32xn = fgy_32x32xn_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
+}
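[Note on the file above: fgy_32x32xn_neon and the fguv variant derive one PRNG seed per involved row before calling into the asm. A standalone example of that derivation, matching the loop body above, with values that are easy to check by hand:]

#include <stdio.h>

static unsigned row_seed(unsigned base_seed, int row_num) {
    unsigned seed = base_seed;
    seed ^= (((unsigned)row_num * 37 + 178) & 0xFF) << 8;
    seed ^= (((unsigned)row_num * 173 + 105) & 0xFF);
    return seed;
}

int main(void) {
    /* For base seed 0: row 3 -> 0x2170; row 2 (the "previous" row used
     * when overlap_flag is set) -> 0xfcc3. */
    printf("%#x %#x\n", row_seed(0, 3), row_seed(0, 2));
    return 0;
}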
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h
new file mode 100644
index 00000000000..aef4daebbf1
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
+
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
+
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
+
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
+
+static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
+ c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
+ c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
+ c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon);
+ c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon);
+ c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon);
+ c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon);
+ c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
+ c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+ c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+ c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon);
+
+ c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon);
+ c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon);
+ c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon);
+ c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon);
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
+
+ c->pal_pred = BF(dav1d_pal_pred, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/itx.h b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h
new file mode 100644
index 00000000000..2ecd086b3be
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+decl_itx17_fns( 4, 4, neon);
+decl_itx16_fns( 4, 8, neon);
+decl_itx16_fns( 4, 16, neon);
+decl_itx16_fns( 8, 4, neon);
+decl_itx16_fns( 8, 8, neon);
+decl_itx16_fns( 8, 16, neon);
+decl_itx2_fns ( 8, 32, neon);
+decl_itx16_fns(16, 4, neon);
+decl_itx16_fns(16, 8, neon);
+decl_itx12_fns(16, 16, neon);
+decl_itx2_fns (16, 32, neon);
+decl_itx2_fns (32, 8, neon);
+decl_itx2_fns (32, 16, neon);
+decl_itx2_fns (32, 32, neon);
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
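+ // For illustration, assign_itx_fn(R, 4, 8, dct_dct, DCT_DCT, neon)
+ // expands (in an 8bpc build) to:
+ // c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_neon;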
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
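+ // The high bit-depth NEON transforms only support 10 bpc, so leave the
+ // C versions in place for 12 bpc.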
+ if (BITDEPTH == 16 && bpc != 10) return;
+
+ assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn(R, 4, 8, neon);
+ assign_itx16_fn(R, 4, 16, neon);
+ assign_itx16_fn(R, 8, 4, neon);
+ assign_itx16_fn( , 8, 8, neon);
+ assign_itx16_fn(R, 8, 16, neon);
+ assign_itx2_fn (R, 8, 32, neon);
+ assign_itx16_fn(R, 16, 4, neon);
+ assign_itx16_fn(R, 16, 8, neon);
+ assign_itx12_fn( , 16, 16, neon);
+ assign_itx2_fn (R, 16, 32, neon);
+ assign_itx1_fn (R, 16, 64, neon);
+ assign_itx2_fn (R, 32, 8, neon);
+ assign_itx2_fn (R, 32, 16, neon);
+ assign_itx2_fn ( , 32, 32, neon);
+ assign_itx1_fn (R, 32, 64, neon);
+ assign_itx1_fn (R, 64, 16, neon);
+ assign_itx1_fn (R, 64, 32, neon);
+ assign_itx1_fn ( , 64, 64, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h
new file mode 100644
index 00000000000..9ac08d94d29
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+
+static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
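+ // loop_filter_sb[plane][dir]: plane 0 is luma, 1 is chroma; dir 0
+ // filters across column edges (h), dir 1 across row edges (v).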
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h
new file mode 100644
index 00000000000..7993dbff683
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if ARCH_AARCH64
+void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+#else
+
+// The 8bpc version calculates things slightly differently from the reference
+// C version. The NEON version calculates roughly this:
+// int16_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += src[idx] * fh[i];
+// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
+// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
+// sum += 1 << (bitdepth + 6 - round_bits_h);
+// Compared to the reference C version, this is the output of the first pass
+// _minus_ 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. with the
+// round_offset precompensated.
+// The 16bpc version calculates things in much the same way as the reference
+// C version, but with the end result reduced by
+// 1 << (bitdepth + 6 - round_bits_h).
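+// (With the reference's 8bpc round_bits_h of 3, that offset is
+// 1 << (8 + 6 - 3) = 2048, as noted above.)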
+void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[8], intptr_t w,
+ int h, enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+// This calculates things slightly differently from the reference C version.
+// Roughly:
+// int32_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += mid[idx] * fv[i];
+// sum = (sum + rounding_off_v) >> round_bits_v;
+// This function assumes that the width is a multiple of 8.
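+// (Here rounding_off_v and round_bits_v are assumed to follow the
+// reference, i.e. rounding_off_v = 1 << (round_bits_v - 1) with
+// round_bits_v = 11 for 8 and 10 bpc.)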
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[8], enum LrEdgeFlags edges,
+ ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+ ALIGN_STK_16(int16_t, mid, 68 * 384,);
+ int mid_stride = (w + 7) & ~7;
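+ // mid holds two padding rows above and below the h body rows: rows 0-1
+ // are filled from lpf when LR_HAVE_TOP is set, rows 2..h+1 from the
+ // current block, and rows h+2..h+3 from lpf when LR_HAVE_BOTTOM is set.
+ // The 68 rows cover a 64-pixel-high unit plus this padding.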
+
+ // Horizontal filter
+ BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
+ filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
+ filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+ lpf + 6 * PXSTRIDE(stride),
+ stride, filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+
+ // Vertical filter
+ BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
+ w, h, filter[1], edges,
+ mid_stride * sizeof(*mid)
+ HIGHBD_TAIL_SUFFIX);
+}
+#endif
+
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 3x3 box (radius=1) */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
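+ // sumsq/sum point two rows into their backing buffers so that the two
+ // LR_HAVE_TOP rows can be written at negative row offsets; a and b alias
+ // the same memory, which dav1d_sgr_calc_ab1_neon rewrites in place.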
+
+ BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 5x5 box (radius=2) */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+ BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int w, const int h,
+ const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int16_t *t2,
+ const int w, const int h,
+ const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+}
+
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+}
+
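+/* the "mix" mode runs both the 5x5 and 3x3 box filters on the unmodified
+   source, then blends the two intermediates in a single pass weighted by
+   w0 and w1 */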
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
+ BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
+ tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
+}
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if ARCH_AARCH64
+ c->wiener[0] = BF(dav1d_wiener_filter7, neon);
+ c->wiener[1] = BF(dav1d_wiener_filter5, neon);
+#else
+ c->wiener[0] = c->wiener[1] = wiener_filter_neon;
+#endif
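+ // The NEON SGR implementation covers 8 and 10 bpc only; 12 bpc keeps
+ // the C fallback.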
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = sgr_filter_5x5_neon;
+ c->sgr[1] = sgr_filter_3x3_neon;
+ c->sgr[2] = sgr_filter_mix_neon;
+ }
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/mc.h b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h
new file mode 100644
index 00000000000..06cd533a9b4
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_bilin, neon));
+
+decl_avg_fn(BF(dav1d_avg, neon));
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
+static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+
+ c->avg = BF(dav1d_avg, neon);
+ c->w_avg = BF(dav1d_w_avg, neon);
+ c->mask = BF(dav1d_mask, neon);
+ c->blend = BF(dav1d_blend, neon);
+ c->blend_h = BF(dav1d_blend_h, neon);
+ c->blend_v = BF(dav1d_blend_v, neon);
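+ // w_mask[] is ordered by chroma subsampling: 0 = 4:4:4, 1 = 4:2:2,
+ // 2 = 4:2:0.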
+ c->w_mask[0] = BF(dav1d_w_mask_444, neon);
+ c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+ c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+ c->emu_edge = BF(dav1d_emu_edge, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h
index acde030a368..4c96fc50952 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h
@@ -30,7 +30,7 @@
decl_splat_mv_fn(dav1d_splat_mv_neon);
-COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
+static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;