diff options
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/arm')
14 files changed, 1484 insertions, 122 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S index db8ecffe6ea..aa6c272e718 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S @@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst .macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4s_x4 \r0, \r2, \r4, \r6 + vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + vmin.s32 \r, \r, q5 +.endr +.irp r, \r0, \r2, \r4, \r6 + vmax.s32 \r, \r, q4 +.endr + vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a - vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a vrshr.s32 \r1, q2, #12 // t4a - vrshr.s32 \r7, q4, #12 // t7a + vrshr.s32 \r7, q3, #12 // t7a vrshr.s32 \r3, q6, #12 // t5a vrshr.s32 \r5, q7, #12 // t6a @@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst vqadd.s32 q3, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a - vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5 +.irp r, q2, \r1, q3, \r3 + vmin.s32 \r, \r, q5 +.endr +.irp r, q2, \r1, q3, \r3 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6 - vrshr.s32 q4, q4, #12 // t5 + vrshr.s32 q7, q7, #12 // t5 vrshr.s32 q5, q6, #12 // t6 vqsub.s32 \r7, \r0, q3 // out7 vqadd.s32 \r0, \r0, q3 // out0 vqadd.s32 \r1, \r2, q5 // out1 vqsub.s32 q6, \r2, q5 // out6 - vqadd.s32 \r2, \r4, q4 // out2 - vqsub.s32 \r5, \r4, q4 // out5 + vqadd.s32 \r2, \r4, q7 // out2 + vqsub.s32 \r5, \r4, q7 // out5 vqadd.s32 \r3, \r6, q2 // out3 vqsub.s32 \r4, \r6, q2 // out4 vmov \r6, q6 // out6 @@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst .macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_2s_x4 \r0, \r2, \r4, \r6 + vmov.i32 d9, 
#0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + vmin.s32 \r, \r, d9 +.endr +.irp r, \r0, \r2, \r4, \r6 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a @@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst vqadd.s32 d5, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a +.irp r, d4, \r1, d5, \r3 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, \r1, d5, \r3 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6 vrshr.s32 d6, d6, #12 // t5 @@ -763,19 +795,28 @@ endfunc vqadd.s32 q2, q8, q12 // t0 vqsub.s32 q3, q8, q12 // t4 + vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vqadd.s32 q4, q15, q11 // t1 vqsub.s32 q5, q15, q11 // t5 vqadd.s32 q6, q10, q14 // t2 vqsub.s32 q7, q10, q14 // t6 + vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 q10, q13, q9 // t3 vqsub.s32 q11, q13, q9 // t7 +.irp r, q2, q3, q4, q5, q6, q7, q10, q11 + vmin.s32 \r, \r, q12 +.endr +.irp r, q2, q3, q4, q5, q6, q7, q10, q11 + vmax.s32 \r, \r, q14 +.endr + vmul_vmla q8, q3, q5, d1[1], d1[0] - vmul_vmls q12, q3, q5, d1[0], d1[1] + vmul_vmls q13, q3, q5, d1[0], d1[1] vmul_vmls q14, q11, q7, d1[1], d1[0] vrshr.s32 q3, q8, #12 // t4a - vrshr.s32 q5, q12, #12 // t5a + vrshr.s32 q5, q13, #12 // t5a vmul_vmla q8, q11, q7, d1[0], d1[1] @@ -786,12 +827,24 @@ endfunc vqsub.s32 q2, q2, q6 // t2 vqadd.s32 \r7, q4, q10 // out7 vqsub.s32 q4, q4, q10 // t3 - vqneg.s32 \r7, \r7 // out7 + + vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 \r1, q3, q7 // out1 vqsub.s32 q3, q3, q7 // t6 vqadd.s32 \r6, q5, q11 // out6 vqsub.s32 q5, q5, q11 // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. 
+.irp r, q2, q4, q3, q5 + vmin.s32 \r, \r, q12 +.endr +.irp r, q2, q4, q3, q5 + vmax.s32 \r, \r, q10 +.endr + + vqneg.s32 \r7, \r7 // out7 vqneg.s32 \r1, \r1 // out1 vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12) @@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30 + // idct_8 leaves the row_clip_max/min constants in d9 and d8 +.irp r, d16, d18, d20, d22, d24, d26, d28, d30 + vmin.s32 \r, \r, d9 +.endr +.irp r, d16, d18, d20, d22, d24, d26, d28, d30 + vmax.s32 \r, \r, d8 +.endr + vld1.32 {q0, q1}, [r12, :128] sub r12, r12, #32 @@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon vqadd.s32 d25, d29, d27 // t12 vqsub.s32 d29, d29, d27 // t13 +.irp r, d4, d17, d5, d31, d23, d19, d25, d29 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, d17, d5, d31, d23, d19, d25, d29 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a vrshr.s32 d21, d6, #12 // t9a @@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon vqsub.s32 d25, d27, d29 // t13 vqadd.s32 d27, d27, d29 // t14 +.irp r, d4, d17, d5, d31, d19, d21, d25, d27 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, d17, d5, d31, d19, d21, d25, d27 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11 vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12 vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a @@ -1193,6 +1268,9 @@ endfunc vld1.32 {q0, q1}, [r12, :128] + vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + vqsub.s32 d5, d16, d23 // t8a vqadd.s32 d16, d16, d23 // t0a vqsub.s32 d7, d31, d24 // t9a @@ -1210,6 +1288,13 @@ endfunc vqadd.s32 d28, d25, d30 // t7a vqsub.s32 d25, d25, d30 // t15a +.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 + vmin.s32 \r, \r, d11 +.endr +.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 + vmax.s32 \r, \r, d10 
+.endr + vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8 vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9 vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10 @@ -1244,6 +1329,13 @@ endfunc vqadd.s32 d20, d29, d22 // t11a vqsub.s32 d29, d29, d22 // t15a +.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 + vmax.s32 \r, \r, d10 +.endr + vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a @@ -1272,24 +1364,34 @@ endfunc vqadd.s32 \o15,d31, d26 // out15 vmov \o0, d4 .endif - vqneg.s32 \o15, \o15 // out15 vqsub.s32 d3, d29, d18 // t15a vqadd.s32 \o13,d29, d18 // out13 vqadd.s32 \o2, d17, d30 // out2 vqsub.s32 d26, d17, d30 // t14a - vqneg.s32 \o13,\o13 // out13 vqadd.s32 \o1, d19, d27 // out1 vqsub.s32 d27, d19, d27 // t10 vqadd.s32 \o14,d28, d20 // out14 vqsub.s32 d20, d28, d20 // t11 - vqneg.s32 \o1, \o1 // out1 vqadd.s32 \o3, d22, d24 // out3 vqsub.s32 d22, d22, d24 // t6 vqadd.s32 \o12,d25, d23 // out12 vqsub.s32 d23, d25, d23 // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. 
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d21, d3, d26, d27, d20, d22, d23 + vmax.s32 \r, \r, d10 +.endr + + vqneg.s32 \o15, \o15 // out15 + vqneg.s32 \o13,\o13 // out13 + vqneg.s32 \o1, \o1 // out1 vqneg.s32 \o3, \o3 // out3 vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) @@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon vld1.32 {q0, q1}, [r12, :128] + vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + vqsub.s32 d5, d16, d24 // t17 vqadd.s32 d16, d16, d24 // t16 vqsub.s32 d7, d31, d23 // t30 @@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon vqadd.s32 d25, d19, d27 // t28 vqsub.s32 d19, d19, d27 // t29 +.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 + vmin.s32 \r, \r, d11 +.endr +.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 + vmax.s32 \r, \r, d10 +.endr + vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a @@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon vqsub.s32 d29, d31, d25 // t28a vqadd.s32 d31, d31, d25 // t31a +.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 + vmax.s32 \r, \r, d10 +.endr + vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19 @@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon vqsub.s32 d24, d24, d19 // t27a vmov d19, d4 // out19 +.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 + vmax.s32 \r, \r, d10 +.endr + vmul_vmls d4, 
d24, d26, d0[0], d0[0] // -> t20 vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27 vrshr.s32 d20, d4, #12 // t20 @@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct_2s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in d9 and d8, + // but here we want to use full q registers for clipping. + vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmin.s32 \r, \r, q3 +.endr +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmax.s32 \r, \r, q2 +.endr + vtrn.32 d16, d17 vtrn.32 d18, d19 vtrn.32 d20, d21 @@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon vqsub.s32 d30, d23, d22 // t62 vqadd.s32 d31, d23, d22 // t63 +.irp r, q12, q13, q14, q15 + vmin.s32 \r, \r, q5 +.endr +.irp r, q12, q13, q14, q15 + vmax.s32 \r, \r, q4 +.endr + vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a vneg.s32 d4, d4 // t34a - vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a + vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a vrshr.s32 d26, d4, #12 // t34a vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a vrshr.s32 d29, d6, #12 // t61a - vrshr.s32 d25, d8, #12 // t33a + vrshr.s32 d25, d7, #12 // t33a vrshr.s32 d30, d4, #12 // t62a vqadd.s32 d16, d24, d27 // t32a @@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon vqsub.s32 d21, d30, d29 // t61 vqadd.s32 d22, d30, d29 // t62 +.irp r, q8, q9, q10, q11 + vmin.s32 \r, \r, q5 +.endr +.irp r, q8, q9, q10, q11 + vmax.s32 \r, \r, q4 +.endr + vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a - vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60 + vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60 vrshr.s32 d21, d4, #12 // t61a vrshr.s32 d18, d6, #12 // t34a vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35 - vrshr.s32 d20, d8, #12 // t60 + vrshr.s32 
d20, d7, #12 // t60 vrshr.s32 d19, d4, #12 // t35 vst1.32 {d16, d17, d18, d19}, [r6, :128]! @@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon vqadd.s32 d30, d23, d22 // t48 vqsub.s32 d31, d23, d22 // t55 +.irp r, q12, q13, q14, q15 + vmin.s32 \r, \r, q5 +.endr +.irp r, q12, q13, q14, q15 + vmax.s32 \r, \r, q4 +.endr + vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a - vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a + vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a vrshr.s32 d25, d4, #12 // t56a vrshr.s32 d27, d6, #12 // t39a - vneg.s32 d8, d8 // t40a + vneg.s32 d7, d7 // t40a vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a - vrshr.s32 d31, d8, #12 // t40a + vrshr.s32 d31, d7, #12 // t40a vrshr.s32 d28, d4, #12 // t55a vqadd.s32 d16, d24, d29 // t32a @@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon vqsub.s32 d21, d25, d28 // t55 vqadd.s32 d22, d25, d28 // t56 +.irp r, q8, q9, q10, q11 + vmin.s32 \r, \r, q5 +.endr +.irp r, q8, q9, q10, q11 + vmax.s32 \r, \r, q4 +.endr + vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a - vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47 + vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47 vrshr.s32 d18, d4, #12 // t40a vrshr.s32 d21, d6, #12 // t55a vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48 - vrshr.s32 d19, d8, #12 // t47 + vrshr.s32 d19, d7, #12 // t47 vrshr.s32 d20, d4, #12 // t48 vstr d16, [r6, #4*2*0] // t32a @@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon bl inv_dct_2s_x16_neon + // idct_16 leaves the row_clip_max/min constants in d9 and d8, + // but here we want to use full q registers for clipping. 
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmin.s32 \r, \r, q3 +.endr +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmax.s32 \r, \r, q2 +.endr + store16 r6 movdup_if d0, r12, 2896*8*(1<<16), \scale @@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon mov r9, #-8 + vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .macro store_addsub r0, r1, r2, r3 vld1.32 {d2}, [r6, :64]! vld1.32 {d3}, [r6, :64]! @@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon vld1.32 {d4}, [r6, :64]! vqadd.s32 d7, d3, \r1 vqsub.s32 \r1, d3, \r1 + vmin.s32 d6, d6, d1 + vmin.s32 \r0, \r0, d1 vld1.32 {d5}, [r6, :64]! vqadd.s32 d2, d4, \r2 sub r6, r6, #8*4 + vmax.s32 d6, d6, d0 + vmax.s32 \r0, \r0, d0 vqsub.s32 \r2, d4, \r2 + vmin.s32 d7, d7, d1 + vmin.s32 \r1, \r1, d1 vst1.32 {d6}, [r6, :64]! vst1.32 {\r0}, [r10, :64], r9 + vmin.s32 d2, d2, d1 + vmin.s32 \r2, \r2, d1 + vmax.s32 d7, d7, d0 + vmax.s32 \r1, \r1, d0 vqadd.s32 d3, d5, \r3 vqsub.s32 \r3, d5, \r3 + vmax.s32 d2, d2, d0 + vmax.s32 \r2, \r2, d0 + vmin.s32 d3, d3, d1 + vmin.s32 \r3, \r3, d1 vst1.32 {d7}, [r6, :64]! vst1.32 {\r1}, [r10, :64], r9 + vmax.s32 d3, d3, d0 + vmax.s32 \r3, \r3, d0 vst1.32 {d2}, [r6, :64]! vst1.32 {\r2}, [r10, :64], r9 vst1.32 {d3}, [r6, :64]! 
@@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon add r6, r6, #2*4*16 movrel_local r12, idct64_coeffs + vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear add r9, r7, r8, lsl #4 // offset 16 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S index c9650e9d544..b1b2f8fe659 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S @@ -483,10 +483,10 @@ endfunc add \o1\().4s, v5.4s, v7.4s sub \o3\().4s, \o3\().4s, v7.4s - rshrn \o0\().4h, \o0\().4s, #12 - rshrn \o2\().4h, \o2\().4s, #12 - rshrn \o1\().4h, \o1\().4s, #12 - rshrn \o3\().4h, \o3\().4s, #12 + sqrshrn \o0\().4h, \o0\().4s, #12 + sqrshrn \o2\().4h, \o2\().4s, #12 + sqrshrn \o1\().4h, \o1\().4s, #12 + sqrshrn \o3\().4h, \o3\().4s, #12 .endm function inv_adst_4h_x4_neon, export=1 @@ -538,21 +538,21 @@ endfunc sub v4.4s, v4.4s, v2.4s // out3 sub v5.4s, v5.4s, v3.4s - rshrn v18.4h, v18.4s, #12 - rshrn2 v18.8h, v19.4s, #12 + sqrshrn v18.4h, v18.4s, #12 + sqrshrn2 v18.8h, v19.4s, #12 - rshrn \o0\().4h, v16.4s, #12 - rshrn2 \o0\().8h, v17.4s, #12 + sqrshrn \o0\().4h, v16.4s, #12 + sqrshrn2 \o0\().8h, v17.4s, #12 .ifc \o2, v17 mov v17.16b, v18.16b .endif - rshrn \o1\().4h, v6.4s, #12 - rshrn2 \o1\().8h, v7.4s, #12 + sqrshrn \o1\().4h, v6.4s, #12 + sqrshrn2 \o1\().8h, v7.4s, #12 - rshrn \o3\().4h, v4.4s, #12 - rshrn2 \o3\().8h, v5.4s, #12 + sqrshrn \o3\().4h, v4.4s, #12 + sqrshrn2 \o3\().8h, v5.4s, #12 .endm function inv_adst_8h_x4_neon, export=1 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S index 0a0c7768b13..eee3a9636de 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S +++ 
b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S @@ -124,6 +124,13 @@ endconst .endif .endm +.macro smin_4s r0, r1, r2 + smin \r0\().4s, \r1\().4s, \r2\().4s +.endm +.macro smax_4s r0, r1, r2 + smax \r0\().4s, \r1\().4s, \r2\().4s +.endm + .macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 @@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4 \r0, \r2, \r4, \r6 + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + smin_4s \r, \r, v5 +.endr +.irp r, \r0, \r2, \r4, \r6 + smax_4s \r, \r, v4 +.endr + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a - mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a srshr \r1\().4s, v2.4s, #12 // t4a - srshr \r7\().4s, v4.4s, #12 // t7a + srshr \r7\().4s, v3.4s, #12 // t7a srshr \r3\().4s, v6.4s, #12 // t5a srshr \r5\().4s, v7.4s, #12 // t6a @@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst sqadd v3.4s, \r7\().4s, \r5\().4s // t7 sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a - mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5 +.irp r, v2, \r1, v3, \r3 + smin_4s \r, \r, v5 +.endr +.irp r, v2, \r1, v3, \r3 + smax_4s \r, \r, v4 +.endr + + mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5 mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 - srshr v4.4s, v4.4s, #12 // t5 - srshr v5.4s, v6.4s, #12 // t6 + srshr v7.4s, v7.4s, #12 // t5 + srshr v6.4s, v6.4s, #12 // t6 sqsub \r7\().4s, \r0\().4s, v3.4s // out7 sqadd \r0\().4s, \r0\().4s, v3.4s // out0 - sqadd \r1\().4s, \r2\().4s, v5.4s // out1 - sqsub v6.4s, \r2\().4s, v5.4s // out6 - sqadd \r2\().4s, \r4\().4s, v4.4s // out2 - sqsub \r5\().4s, \r4\().4s, v4.4s // out5 + sqadd \r1\().4s, \r2\().4s, v6.4s // out1 + 
sqsub v6.4s, \r2\().4s, v6.4s // out6 + sqadd \r2\().4s, \r4\().4s, v7.4s // out2 + sqsub \r5\().4s, \r4\().4s, v7.4s // out5 sqadd \r3\().4s, \r6\().4s, v2.4s // out3 sqsub \r4\().4s, \r6\().4s, v2.4s // out4 mov \r6\().16b, v6.16b // out6 @@ -660,8 +683,11 @@ endfunc ld1 {v0.4s}, [x16] + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + sqadd v2.4s, v16.4s, v20.4s // t0 sqsub v3.4s, v16.4s, v20.4s // t4 + mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd v4.4s, v23.4s, v19.4s // t1 sqsub v5.4s, v23.4s, v19.4s // t5 sqadd v6.4s, v18.4s, v22.4s // t2 @@ -669,6 +695,13 @@ endfunc sqadd v18.4s, v21.4s, v17.4s // t3 sqsub v19.4s, v21.4s, v17.4s // t7 +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smax_4s \r, \r, v20 +.endr + mul_mla v16, v3, v5, v0.s[3], v0.s[2] mul_mls v20, v3, v5, v0.s[2], v0.s[3] mul_mls v22, v19, v7, v0.s[3], v0.s[2] @@ -685,12 +718,24 @@ endfunc sqsub v2.4s, v2.4s, v6.4s // t2 sqadd \o7\().4s, v4.4s, v18.4s // out7 sqsub v4.4s, v4.4s, v18.4s // t3 - sqneg \o7\().4s, \o7\().4s // out7 + + mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd \o1\().4s, v3.4s, v7.4s // out1 sqsub v3.4s, v3.4s, v7.4s // t6 sqadd \o6\().4s, v5.4s, v19.4s // out6 sqsub v5.4s, v5.4s, v19.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. 
+.irp r, v2, v4, v3, v5 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v4, v3, v5 + smax_4s \r, \r, v18 +.endr + + sqneg \o7\().4s, \o7\().4s // out7 sqneg \o1\().4s, \o1\().4s // out1 mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) @@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + // idct_8 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smax \r\().4s, \r\().4s, v4.4s +.endr + ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #32 mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a - mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a srshr v17.4s, v2.4s, #12 // t8a - srshr v31.4s, v4.4s, #12 // t15a + srshr v31.4s, v3.4s, #12 // t15a mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a - mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a + mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a srshr v23.4s, v6.4s, #12 // t9a srshr v25.4s, v2.4s, #12 // t14a mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a - srshr v21.4s, v4.4s, #12 // t10a + srshr v21.4s, v3.4s, #12 // t10a srshr v27.4s, v6.4s, #12 // t13a - mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a + mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a srshr v19.4s, v2.4s, #12 // t11a - srshr v29.4s, v4.4s, #12 // t12a + srshr v29.4s, v3.4s, #12 // t12a ld1 {v0.4s}, [x16] @@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon sqadd v25.4s, v29.4s, v27.4s // t12 sqsub v29.4s, v29.4s, v27.4s // t13 - mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a mul_mla v6, v3, v2, 
v0.s[3], v0.s[2] // -> t14a - srshr v21.4s, v4.4s, #12 // t9a + srshr v21.4s, v7.4s, #12 // t9a srshr v27.4s, v6.4s, #12 // t14a - mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a - srshr v29.4s, v4.4s, #12 // t13a + srshr v29.4s, v7.4s, #12 // t13a neg v6.4s, v6.4s srshr v23.4s, v6.4s, #12 // t10a @@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon sqsub v25.4s, v27.4s, v29.4s // t13 sqadd v27.4s, v27.4s, v29.4s // t14 - mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11 +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11 mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a - srshr v4.4s, v4.4s, #12 // t11 - srshr v5.4s, v6.4s, #12 // t12 - mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a + srshr v7.4s, v7.4s, #12 // t11 + srshr v6.4s, v6.4s, #12 // t12 + mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a srshr v2.4s, v2.4s, #12 // t10a - srshr v3.4s, v6.4s, #12 // t13a + srshr v3.4s, v3.4s, #12 // t13a - sqadd v6.4s, v16.4s, v31.4s // out0 + sqadd v1.4s, v16.4s, v31.4s // out0 sqsub v31.4s, v16.4s, v31.4s // out15 - mov v16.16b, v6.16b + mov v16.16b, v1.16b sqadd v23.4s, v30.4s, v17.4s // out7 - sqsub v7.4s, v30.4s, v17.4s // out8 + sqsub v1.4s, v30.4s, v17.4s // out8 sqadd v17.4s, v18.4s, v27.4s // out1 sqsub v30.4s, v18.4s, v27.4s // out14 sqadd v18.4s, v20.4s, v3.4s // out2 sqsub v29.4s, v20.4s, v3.4s // out13 sqadd v3.4s, v28.4s, v19.4s // out6 sqsub v25.4s, v28.4s, v19.4s // out9 - sqadd v19.4s, v22.4s, v5.4s // out3 - sqsub v28.4s, v22.4s, v5.4s // out12 - sqadd v20.4s, v24.4s, v4.4s // out4 - sqsub v27.4s, v24.4s, v4.4s // out11 + sqadd v19.4s, v22.4s, v6.4s // out3 + sqsub v28.4s, v22.4s, v6.4s // out12 + sqadd v20.4s, v24.4s, v7.4s // out4 + 
sqsub v27.4s, v24.4s, v7.4s // out11 sqadd v21.4s, v26.4s, v2.4s // out5 sqsub v26.4s, v26.4s, v2.4s // out10 - mov v24.16b, v7.16b + mov v24.16b, v1.16b mov v22.16b, v3.16b ret @@ -1084,6 +1151,9 @@ endfunc ld1 {v0.4s, v1.4s}, [x16] + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + sqsub v2.4s, v16.4s, v23.4s // t8a sqadd v16.4s, v16.4s, v23.4s // t0a sqsub v3.4s, v31.4s, v24.4s // t9a @@ -1101,6 +1171,13 @@ endfunc sqadd v28.4s, v25.4s, v30.4s // t7a sqsub v25.4s, v25.4s, v30.4s // t15a +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smax_4s \r, \r, v7 +.endr + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 @@ -1135,6 +1212,13 @@ endfunc sqadd v20.4s, v29.4s, v22.4s // t11a sqsub v29.4s, v29.4s, v22.4s // t15a +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smax_4s \r, \r, v7 +.endr + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a @@ -1163,24 +1247,34 @@ endfunc sqadd \o15\().4s, v31.4s, v26.4s // out15 mov \o0\().16b, v4.16b .endif - sqneg \o15\().4s, \o15\().4s // out15 sqsub v3.4s, v29.4s, v18.4s // t15a sqadd \o13\().4s, v29.4s, v18.4s // out13 sqadd \o2\().4s, v17.4s, v30.4s // out2 sqsub v26.4s, v17.4s, v30.4s // t14a - sqneg \o13\().4s, \o13\().4s // out13 sqadd \o1\().4s, v19.4s, v27.4s // out1 sqsub v27.4s, v19.4s, v27.4s // t10 sqadd \o14\().4s, v28.4s, v20.4s // out14 sqsub v20.4s, v28.4s, v20.4s // t11 - sqneg \o1\().4s, \o1\().4s // out1 sqadd \o3\().4s, v22.4s, v24.4s // 
out3 sqsub v22.4s, v22.4s, v24.4s // t6 sqadd \o12\().4s, v25.4s, v23.4s // out12 sqsub v23.4s, v25.4s, v23.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. +.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smax_4s \r, \r, v7 +.endr + + sqneg \o15\().4s, \o15\().4s // out15 + sqneg \o13\().4s, \o13\().4s // out13 + sqneg \o1\().4s, \o1\().4s // out1 sqneg \o3\().4s, \o3\().4s // out3 mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) @@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon ld1 {v0.4s, v1.4s}, [x16] + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + sqsub v2.4s, v16.4s, v24.4s // t17 sqadd v16.4s, v16.4s, v24.4s // t16 sqsub v3.4s, v31.4s, v23.4s // t30 @@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon sqadd v25.4s, v19.4s, v27.4s // t28 sqsub v19.4s, v19.4s, v27.4s // t29 - mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a - srshr v21.4s, v4.4s, #12 // t17a + srshr v21.4s, v7.4s, #12 // t17a srshr v27.4s, v6.4s, #12 // t30a neg v2.4s, v2.4s // -> t18a - mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a srshr v19.4s, v2.4s, #12 // t18a - srshr v24.4s, v4.4s, #12 // t29a + srshr v24.4s, v7.4s, #12 // t29a mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a - mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a + 
mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a srshr v22.4s, v6.4s, #12 // t21a srshr v18.4s, v2.4s, #12 // t26a - neg v4.4s, v4.4s // -> t22a + neg v7.4s, v7.4s // -> t22a mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a - srshr v17.4s, v4.4s, #12 // t22a + srshr v17.4s, v7.4s, #12 // t22a srshr v20.4s, v6.4s, #12 // t25a sqsub v2.4s, v27.4s, v24.4s // t29 @@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon sqsub v29.4s, v31.4s, v25.4s // t28a sqadd v31.4s, v31.4s, v25.4s // t31a - mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 - srshr v18.4s, v4.4s, #12 // t18a + srshr v18.4s, v7.4s, #12 // t18a srshr v25.4s, v6.4s, #12 // t29a - mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28 mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 srshr v29.4s, v2.4s, #12 // t19 - srshr v24.4s, v4.4s, #12 // t28 + srshr v24.4s, v7.4s, #12 // t28 neg v6.4s, v6.4s // -> t20 mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 - mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a + mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a srshr v26.4s, v6.4s, #12 // t20 srshr v19.4s, v2.4s, #12 // t27 - neg v4.4s, v4.4s // -> t21a + neg v7.4s, v7.4s // -> t21a mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a - srshr v20.4s, v4.4s, #12 // t21a + srshr v20.4s, v7.4s, #12 // t21a srshr v28.4s, v6.4s, #12 // t26a sqsub v2.4s, v16.4s, v30.4s // t23 @@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon sqsub v21.4s, v27.4s, v22.4s // t25a sqsub v27.4s, v18.4s, v20.4s // t21 sqadd v18.4s, v18.4s, v20.4s // t18 = out18 - sqadd v4.4s, v29.4s, v26.4s 
// t19a = out19 + sqadd v7.4s, v29.4s, v26.4s // t19a = out19 sqsub v26.4s, v29.4s, v26.4s // t20a sqadd v29.4s, v25.4s, v28.4s // t29 = out29 sqsub v25.4s, v25.4s, v28.4s // t26 sqadd v28.4s, v24.4s, v19.4s // t28a = out28 sqsub v24.4s, v24.4s, v19.4s // t27a - mov v19.16b, v4.16b // out19 + mov v19.16b, v7.16b // out19 - mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20 mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 - srshr v20.4s, v4.4s, #12 // t20 + srshr v20.4s, v7.4s, #12 // t20 srshr v22.4s, v6.4s, #12 // t27 - mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a mov v27.16b, v22.16b // t27 - srshr v26.4s, v4.4s, #12 // t26a + srshr v26.4s, v7.4s, #12 // t26a mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 - mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 + mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25 srshr v21.4s, v6.4s, #12 // t21a srshr v22.4s, v24.4s, #12 // t22 - srshr v25.4s, v4.4s, #12 // t25 + srshr v25.4s, v7.4s, #12 // t25 - mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a - srshr v23.4s, v4.4s, #12 // t23a + srshr v23.4s, v7.4s, #12 // t23a srshr v24.4s, v6.4s, #12 // t24a ret @@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_4s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, 
v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 @@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon sqsub v30.4s, v23.4s, v22.4s // t62 sqadd v31.4s, v23.4s, v22.4s // t63 +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a - mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a + mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a neg v2.4s, v2.4s // t34a mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a srshr v26.4s, v2.4s, #12 // t34a mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a - srshr v29.4s, v4.4s, #12 // t61a + srshr v29.4s, v7.4s, #12 // t61a srshr v25.4s, v6.4s, #12 // t33a srshr v30.4s, v2.4s, #12 // t62a @@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon sqsub v21.4s, v30.4s, v29.4s // t61 sqadd v22.4s, v30.4s, v29.4s // t62 +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a - mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a + mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 srshr v21.4s, v2.4s, #12 // t61a - srshr v18.4s, v4.4s, #12 // t34a + srshr v18.4s, v7.4s, #12 // t34a mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 srshr v20.4s, v6.4s, #12 // t60 srshr v19.4s, v2.4s, #12 // t35 @@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon sqadd v30.4s, v23.4s, v22.4s // t48 sqsub v31.4s, v23.4s, v22.4s // t55 +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a - 
mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a srshr v25.4s, v2.4s, #12 // t56a - srshr v27.4s, v4.4s, #12 // t39a + srshr v27.4s, v7.4s, #12 // t39a neg v6.4s, v6.4s // t40a mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a srshr v31.4s, v6.4s, #12 // t40a @@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon sqsub v21.4s, v25.4s, v28.4s // t55 sqadd v22.4s, v25.4s, v28.4s // t56 +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a - mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 srshr v18.4s, v2.4s, #12 // t40a - srshr v21.4s, v4.4s, #12 // t55a + srshr v21.4s, v7.4s, #12 // t55a mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 srshr v19.4s, v6.4s, #12 // t47 srshr v20.4s, v2.4s, #12 // t48 @@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon bl inv_dct_4s_x16_neon + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + store16 x6 movz16dup_if v0.2s, w16, #2896*8, \scale @@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon mov x9, #-16 + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + .macro store_addsub r0, r1, r2, r3 ld1 {v2.4s}, [x6], #16 ld1 {v3.4s}, [x6], #16 @@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon ld1 {v4.4s}, [x6], #16 sqadd v7.4s, v3.4s, \r1 sqsub \r1, v3.4s, \r1 + smin v6.4s, v6.4s, v1.4s + smin \r0, \r0, v1.4s ld1 
{v5.4s}, [x6], #16 sqadd v2.4s, v4.4s, \r2 sub x6, x6, #16*4 + smax v6.4s, v6.4s, v0.4s + smax \r0, \r0, v0.4s sqsub \r2, v4.4s, \r2 + smin v7.4s, v7.4s, v1.4s + smin \r1, \r1, v1.4s st1 {v6.4s}, [x6], #16 st1 {\r0}, [x10], x9 + smin v2.4s, v2.4s, v1.4s + smin \r2, \r2, v1.4s + smax v7.4s, v7.4s, v0.4s + smax \r1, \r1, v0.4s sqadd v3.4s, v5.4s, \r3 sqsub \r3, v5.4s, \r3 + smax v2.4s, v2.4s, v0.4s + smax \r2, \r2, v0.4s + smin v3.4s, v3.4s, v1.4s + smin \r3, \r3, v1.4s st1 {v7.4s}, [x6], #16 st1 {\r1}, [x10], x9 + smax v3.4s, v3.4s, v0.4s + smax \r3, \r3, v0.4s st1 {v2.4s}, [x6], #16 st1 {\r2}, [x10], x9 st1 {v3.4s}, [x6], #16 @@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon add x6, x6, #4*4*16 movrel x17, idct64_coeffs + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear add x9, x7, x8, lsl #4 // offset 16 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S index 2b9b5c408ec..63d5de10ada 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_16_wd\wd\()_neon uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_16_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.16b, #1 uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) @@ -474,20 +481,20 @@ function lpf_16_wd\wd\()_neon bif v11.16b, 
v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -497,22 +504,34 @@ loop_filter 6 loop_filter 4 .macro lpf_16_wd16 - adr x13, 7f - adr x14, 8f bl lpf_16_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd8 - adr x14, 8f bl lpf_16_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd6 bl lpf_16_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_16_wd4 bl lpf_16_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_16_neon diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S index aab0230c44b..d181a3e6239 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_8_wd\wd\()_neon uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_8_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.8h, #1 uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) @@ -360,20 +367,20 @@ function lpf_8_wd\wd\()_neon bif v11.16b, v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + 
mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -383,22 +390,34 @@ loop_filter 6 loop_filter 4 .macro lpf_8_wd16 - adr x13, 7f - adr x14, 8f bl lpf_8_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd8 - adr x14, 8f bl lpf_8_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_8_neon diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S index d1083c6b561..dc50415f1f1 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S @@ -135,6 +135,12 @@ #endif #define GNU_PROPERTY_AARCH64_PAC (1 << 1) +#elif defined(__APPLE__) && defined(__arm64e__) + +#define GNU_PROPERTY_AARCH64_PAC 0 +#define AARCH64_SIGN_LINK_REGISTER pacibsp +#define AARCH64_VALIDATE_LINK_REGISTER autibsp + #else /* __ARM_FEATURE_PAC_DEFAULT */ #define GNU_PROPERTY_AARCH64_PAC 0 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h new file mode 100644 index 00000000000..2e8c8ab6fb8 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h @@ -0,0 +1,88 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/cdef.h" + +decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon)); + +void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, + const pixel *const bottom, int h, + enum CdefEdgeFlags edges); +void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, + const pixel *const bottom, int h, + enum CdefEdgeFlags edges); + +// Passing edges to this function, to allow it to switch to a more +// optimized version for fully edged cases. Using size_t for edges, +// to avoid ABI differences for passing more than one argument on the stack. 
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); +void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); + +#define DEFINE_FILTER(w, h, tmp_stride) \ +static void \ +cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ + const int pri_strength, const int sec_strength, \ + const int dir, const int damping, \ + const enum CdefEdgeFlags edges \ + HIGHBD_DECL_SUFFIX) \ +{ \ + ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ + uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \ + BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \ + left, top, bottom, h, edges); \ + BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \ + sec_strength, dir, damping, h, edges \ + HIGHBD_TAIL_SUFFIX); \ +} + +DEFINE_FILTER(8, 8, 16) +DEFINE_FILTER(4, 8, 8) +DEFINE_FILTER(4, 4, 8) + +static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->dir = BF(dav1d_cdef_find_dir, neon); + c->fb[0] = cdef_filter_8x8_neon; + c->fb[1] = cdef_filter_4x8_neon; + c->fb[2] = cdef_filter_4x4_neon; +} diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h new file mode 100644 index 00000000000..48776ac8524 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h @@ -0,0 +1,204 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * Copyright © 2021, Martin Storsjo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/filmgrain.h" +#include "asm-offsets.h" + +CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT); + +CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET); +CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE); + +void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX); + +#define GEN_GRAIN_UV(suff) \ +void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, \ + const intptr_t uv \ + HIGHBD_DECL_SUFFIX) + +GEN_GRAIN_UV(420); +GEN_GRAIN_UV(422); +GEN_GRAIN_UV(444); + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// same layout of parameters on the stack across platforms. 
+void BF(dav1d_fgy_32x32, neon)(pixel *const dst, + const pixel *const src, + const ptrdiff_t stride, + const uint8_t scaling[SCALING_SIZE], + const int scaling_shift, + const entry grain_lut[][GRAIN_WIDTH], + const int offsets[][2], + const int h, const ptrdiff_t clip, + const ptrdiff_t type + HIGHBD_DECL_SUFFIX); + +static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + int type = 0; + if (data->overlap_flag && row_num) + type |= 1; /* overlap y */ + if (data->overlap_flag && bx) + type |= 2; /* overlap x */ + + BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride, + scaling, data->scaling_shift, + grain_lut, offsets, bh, + data->clip_to_restricted_range, type + HIGHBD_TAIL_SUFFIX); + } +} + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// parameters on the stack with the same layout across platforms. 
+#define FGUV(nm, sx, sy) \ +void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \ + const pixel *const src, \ + const ptrdiff_t stride, \ + const uint8_t scaling[SCALING_SIZE], \ + const Dav1dFilmGrainData *const data, \ + const entry grain_lut[][GRAIN_WIDTH], \ + const pixel *const luma_row, \ + const ptrdiff_t luma_stride, \ + const int offsets[][2], \ + const ptrdiff_t h, const ptrdiff_t uv, \ + const ptrdiff_t is_id, \ + const ptrdiff_t type \ + HIGHBD_DECL_SUFFIX); \ +static void \ +fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \ + const size_t pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], const int bh, \ + const int row_num, const pixel *const luma_row, \ + const ptrdiff_t luma_stride, const int uv, const int is_id \ + HIGHBD_DECL_SUFFIX) \ +{ \ + const int rows = 1 + (data->overlap_flag && row_num > 0); \ + \ + /* seed[0] contains the current row, seed[1] contains the previous */ \ + unsigned seed[2]; \ + for (int i = 0; i < rows; i++) { \ + seed[i] = data->seed; \ + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \ + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \ + } \ + \ + int offsets[2 /* col offset */][2 /* row offset */]; \ + \ + /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \ + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \ + if (data->overlap_flag && bx) { \ + /* shift previous offsets left */ \ + for (int i = 0; i < rows; i++) \ + offsets[1][i] = offsets[0][i]; \ + } \ + \ + /* update current offsets */ \ + for (int i = 0; i < rows; i++) \ + offsets[0][i] = get_random_number(8, &seed[i]); \ + \ + int type = 0; \ + if (data->overlap_flag && row_num) \ + type |= 1; /* overlap y */ \ + if (data->overlap_flag && bx) \ + type |= 2; /* overlap x */ \ + if (data->chroma_scaling_from_luma) \ + type |= 4; \ + \ + BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \ + 
scaling, data, grain_lut, \ + luma_row + (bx << sx), luma_stride, \ + offsets, bh, uv, is_id, type \ + HIGHBD_TAIL_SUFFIX); \ + } \ +} + +FGUV(420, 1, 1); +FGUV(422, 1, 0); +FGUV(444, 0, 0); + +static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon); + + c->fgy_32x32xn = fgy_32x32xn_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; +} diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h new file mode 100644 index 00000000000..aef4daebbf1 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h @@ -0,0 +1,80 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/ipred.h" + +decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon)); + +decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon)); + +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); + +decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); + +static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & 
DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon); + c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon); + c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon); + c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon); + c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon); + c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon); + c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon); + c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); + c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); + c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon); + c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon); + + c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon); + c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon); + c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon); + c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon); + + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon); + + c->pal_pred = BF(dav1d_pal_pred, neon); +} diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/itx.h b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h new file mode 100644 index 00000000000..2ecd086b3be --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h @@ -0,0 +1,141 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/itx.h" + +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + +decl_itx17_fns( 4, 4, neon); +decl_itx16_fns( 4, 8, neon); +decl_itx16_fns( 4, 16, neon); +decl_itx16_fns( 8, 4, neon); +decl_itx16_fns( 8, 8, neon); +decl_itx16_fns( 8, 16, neon); +decl_itx2_fns ( 8, 32, neon); +decl_itx16_fns(16, 4, neon); +decl_itx16_fns(16, 8, neon); +decl_itx12_fns(16, 16, neon); +decl_itx2_fns (16, 32, neon); +decl_itx2_fns (32, 8, neon); +decl_itx2_fns (32, 16, neon); +decl_itx2_fns (32, 32, neon); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon)); 
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+// Fills c->itxfm_add[][] with the NEON inverse-transform functions; returns without touching c when NEON is absent, or when BITDEPTH == 16 and bpc != 10.
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) /* one table entry: itxfm_add[pfx##TX_<w>X<h>][type_enum] */ \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+// itx1: DCT_DCT only.
+#define assign_itx1_fn(pfx, w, h, ext) \
+    assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+// itx2: itx1 plus IDTX (identity_identity).
+#define assign_itx2_fn(pfx, w, h, ext) \
+    assign_itx1_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+// itx12: itx2 plus the ten entries below. NOTE(review): each function name (e.g. dct_adst) lists the two 1-D types in the opposite order from its enum (ADST_DCT); presumably intentional - confirm against the enum definition.
+#define assign_itx12_fn(pfx, w, h, ext) \
+    assign_itx2_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+    assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+    assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+    assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+    assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+    assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+// itx16: itx12 plus the four adst/flipadst-with-identity entries below.
+#define assign_itx16_fn(pfx, w, h, ext) \
+    assign_itx12_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+// itx17: itx16 plus WHT_WHT.
+#define assign_itx17_fn(pfx, w, h, ext) \
+    assign_itx16_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+// NOTE(review): none of the assign_itx*_fn macros is #undef'd inside this function.
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; // no NEON: nothing is installed
+
+    if (BITDEPTH == 16 && bpc != 10) return; // with BITDEPTH == 16, only bpc == 10 gets these functions
+    // First argument is pfx (empty or R), pasted in front of TX_ to form the table index; presumably R selects the rectangular RTX_* names - confirm.
+    assign_itx17_fn( ,  4,  4, neon);
+    assign_itx16_fn(R,  4,  8, neon);
+    assign_itx16_fn(R,  4, 16, neon);
+    assign_itx16_fn(R,  8,  4, neon);
+    assign_itx16_fn( ,  8,  8, neon);
+    assign_itx16_fn(R,  8, 16, neon);
+    assign_itx2_fn (R,  8, 32, neon);
+    assign_itx16_fn(R, 16,  4, neon);
+    assign_itx16_fn(R, 16,  8, neon);
+    assign_itx12_fn( , 16, 16, neon);
+    assign_itx2_fn (R, 16, 32, neon);
+    assign_itx1_fn (R, 16, 64, neon);
+    assign_itx2_fn (R, 32,  8, neon);
+    assign_itx2_fn (R, 32, 16, neon);
+    assign_itx2_fn ( , 32, 32, neon);
+    assign_itx1_fn (R, 32, 64, neon);
+    assign_itx1_fn (R, 64, 16, neon);
+    assign_itx1_fn (R, 64, 32, neon);
+    assign_itx1_fn ( , 64, 64, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h
new file mode 100644
index 00000000000..9ac08d94d29
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+// Points the four c->loop_filter_sb[][] entries at the NEON lpf functions; returns without touching c when the NEON CPU flag is not set.
+static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; // no NEON: nothing is installed
+    // As assigned below: first index 0 = y, 1 = uv; second index 0 = h, 1 = v.
+    c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+    c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+    c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+    c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h
new file mode 100644
index 00000000000..7993dbff683
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/looprestoration.h" + +#if ARCH_AARCH64 +void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +#else + +// The 8bpc version calculates things slightly differently than the reference +// C version. 
That version calculates roughly this: +// int16_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += src[idx] * fh[i]; +// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h; +// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h; +// sum += 1 << (bitdepth + 6 - round_bits_h); +// Compared to the reference C version, this is the output of the first pass +// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. +// with round_offset precompensated. +// The 16bpc version calculates things pretty much the same way as the +// reference C version, but with the end result subtracted by +// 1 << (bitdepth + 6 - round_bits_h). +void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4], + const pixel *src, ptrdiff_t stride, + const int16_t fh[8], intptr_t w, + int h, enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +// This calculates things slightly differently than the reference C version. +// This version calculates roughly this: +// int32_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += mid[idx] * fv[i]; +// sum = (sum + rounding_off_v) >> round_bits_v; +// This function assumes that the width is a multiple of 8. 
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+                                     const int16_t *mid, int w, int h,
+                                     const int16_t fv[8], enum LrEdgeFlags edges,
+                                     ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+// Wiener filter over dst: dav1d_wiener_filter_h writes int16_t rows into mid, then dav1d_wiener_filter_v reads mid and writes the result back to dst.
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
+                               const pixel (*const left)[4], const pixel *lpf,
+                               const int w, const int h,
+                               const LooprestorationParams *const params,
+                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    const int16_t (*const filter)[8] = params->filter; // filter[0] goes to the horizontal pass, filter[1] to the vertical pass
+    ALIGN_STK_16(int16_t, mid, 68 * 384,); // NOTE(review): presumably 64 + 2 + 2 rows of up to 384 samples - confirm
+    int mid_stride = (w + 7) & ~7; // w rounded up to a multiple of 8
+
+    // Horizontal filter
+    BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
+                                    filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
+    if (edges & LR_HAVE_TOP) // 2 rows read from lpf fill mid rows 0-1, ahead of the main rows
+        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
+                                        filter[0], w, 2, edges
+                                        HIGHBD_TAIL_SUFFIX);
+    if (edges & LR_HAVE_BOTTOM) // 2 rows read from lpf + 6 strides fill the mid rows after the main rows
+        BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+                                        lpf + 6 * PXSTRIDE(stride),
+                                        stride, filter[0], w, 2, edges
+                                        HIGHBD_TAIL_SUFFIX);
+
+    // Vertical filter
+    BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
+                                    w, h, filter[1], edges,
+                                    mid_stride * sizeof(*mid) // mid stride is passed in bytes
+                                    HIGHBD_TAIL_SUFFIX);
+}
+#endif
+
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+                                const pixel (*left)[4],
+                                const pixel *src, const ptrdiff_t stride,
+                                const int w, const int h,
+                                const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength,
+                             const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+                                        const pixel *src, const ptrdiff_t stride,
+                                        const int32_t *a, const int16_t *b,
+                                        const int w, const int h);
+
+/* filter with a 3x3 box (radius=1) */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4], const pixel *lpf,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges
+                                   HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; // starts 2 rows of (384 + 16) plus 8 entries in, so sumsq[-2 * (384 + 16)] below stays inside sumsq_mem; a aliases sumsq
+    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; // same layout with a 16-entry pad; b aliases sum
+
+    BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+    if (edges & LR_HAVE_TOP) // 2 rows read from lpf go into rows -2 and -1
+        BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+                                   NULL, lpf, stride, w, 2, edges);
+
+    if (edges & LR_HAVE_BOTTOM) // 2 rows read from lpf + 6 strides go into rows h and h + 1
+        BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+                                   NULL, lpf + 6 * PXSTRIDE(stride),
+                                   stride, w, 2, edges);
+    // a and b point at the same storage as sumsq and sum, so calc_ab1 presumably overwrites the box sums - confirm in the asm.
+    dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+    BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+                                const pixel (*left)[4],
+                                const pixel *src, const ptrdiff_t stride,
+                                const int w, const int h,
+                                const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength,
+                             const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+                                        const pixel *src, const ptrdiff_t stride,
+                                        const int32_t *a, const int16_t *b,
+                                        const int w, const int h);
+
+/* filter with a 5x5 box (radius=2) */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4], const pixel *lpf,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges
+                                   HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+    int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; // starts 2 rows of (384 + 16) plus 8 entries in, so sumsq[-2 * (384 + 16)] below stays inside sumsq_mem; a aliases sumsq
+    ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+    int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; // same layout with a 16-entry pad; b aliases sum
+
+    BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+    if (edges & LR_HAVE_TOP) // 2 rows read from lpf go into rows -2 and -1
+        BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+                                   NULL, lpf, stride, w, 2, edges);
+
+    if (edges & LR_HAVE_BOTTOM) // 2 rows read from lpf + 6 strides go into rows h and h + 1
+        BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+                                   NULL, lpf + 6 * PXSTRIDE(stride),
+                                   stride, w, 2, edges);
+    // a and b point at the same storage as sumsq and sum, so calc_ab2 presumably overwrites the box sums - confirm in the asm.
+    dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+    BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+                                   const pixel *src, const ptrdiff_t src_stride,
+                                   const int16_t *t1, const int w, const int h,
+                                   const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+                                   const pixel *src, const ptrdiff_t src_stride,
+                                   const int16_t *t1, const int16_t *t2,
+                                   const int w, const int h,
+                                   const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+// 5x5-only variant: runs dav1d_sgr_filter2_neon with params->sgr.s0 into tmp, then dav1d_sgr_weighted1 with params->sgr.w0, using dst as both source and destination.
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
+                                const pixel (*const left)[4], const pixel *lpf,
+                                const int w, const int h,
+                                const LooprestorationParams *const params,
+                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+    dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
+                           w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+                                  tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+}
+// 3x3-only variant: runs dav1d_sgr_filter1_neon with params->sgr.s1 into tmp, then dav1d_sgr_weighted1 with params->sgr.w1, using dst as both source and destination.
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
+                                const pixel (*const left)[4], const pixel *lpf,
+                                const int w, const int h,
+                                const LooprestorationParams *const params,
+                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+    dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
+                           w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+                                  tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+}
+// Mixed variant: runs both box filters (5x5 into tmp1, 3x3 into tmp2) and hands both to dav1d_sgr_weighted2 with the weight pair { w0, w1 }.
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
+                                const pixel (*const left)[4], const pixel *lpf,
+                                const int w, const int h,
+                                const LooprestorationParams *const params,
+                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+    ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+    ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+    dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
+                           w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+    dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
+                           w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+    const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
+    BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
+                                  tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
+}
+// Installs the NEON Wiener and SGR entry points into c; returns without touching c when the NEON CPU flag is not set.
+static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if ARCH_AARCH64
+    c->wiener[0] = BF(dav1d_wiener_filter7, neon);
+    c->wiener[1] = BF(dav1d_wiener_filter5, neon);
+#else
+    c->wiener[0] = c->wiener[1] = wiener_filter_neon; // both wiener slots share the C wrapper defined in this header
+#endif
+    if (BITDEPTH == 8 || bpc == 10) { // SGR entries are installed only when BITDEPTH == 8 or bpc == 10
+        c->sgr[0] = sgr_filter_5x5_neon;
+        c->sgr[1] = sgr_filter_3x3_neon;
+        c->sgr[2] = sgr_filter_mix_neon;
+    }
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/mc.h b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h
new file mode 100644
index 00000000000..06cd533a9b4
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_bilin, neon));
+
+decl_avg_fn(BF(dav1d_avg, neon));
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+// Installs the NEON put/prep, avg, mask, blend, w_mask, warp and emu_edge functions into c; returns without touching c when the NEON CPU flag is not set.
+static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) /* c->mc[type] = the dav1d_put_<name> variant */ \
+    c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) /* c->mct[type] = the dav1d_prep_<name> variant */ \
+    c->mct[type] = BF(dav1d_prep_##name, suffix)
+    const unsigned flags = dav1d_get_cpu_flags();
+    // NOTE(review): init_mc_fn and init_mct_fn are not #undef'd anywhere in this function.
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+    // mc[] and mct[] are filled with the same ten FILTER_2D_* indices, in the same order.
+    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
+    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
+    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               neon);
+
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
+
+    c->avg = BF(dav1d_avg, neon);
+    c->w_avg = BF(dav1d_w_avg, neon);
+    c->mask = BF(dav1d_mask, neon);
+    c->blend = BF(dav1d_blend, neon);
+    c->blend_h = BF(dav1d_blend_h, neon);
+    c->blend_v = BF(dav1d_blend_v, neon);
+    c->w_mask[0] = BF(dav1d_w_mask_444, neon); // w_mask index order as assigned: 0 = 444, 1 = 422, 2 = 420
+    c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+    c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+    c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+    c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+    c->emu_edge = BF(dav1d_emu_edge, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h
index acde030a368..4c96fc50952 100644
--- 
a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c +++ b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h @@ -30,7 +30,7 @@ decl_splat_mv_fn(dav1d_splat_mv_neon); -COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { +static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; |