Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src')
82 files changed, 10542 insertions, 2158 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S index db8ecffe6ea..aa6c272e718 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S @@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst .macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4s_x4 \r0, \r2, \r4, \r6 + vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + vmin.s32 \r, \r, q5 +.endr +.irp r, \r0, \r2, \r4, \r6 + vmax.s32 \r, \r, q4 +.endr + vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a - vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a vrshr.s32 \r1, q2, #12 // t4a - vrshr.s32 \r7, q4, #12 // t7a + vrshr.s32 \r7, q3, #12 // t7a vrshr.s32 \r3, q6, #12 // t5a vrshr.s32 \r5, q7, #12 // t6a @@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst vqadd.s32 q3, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a - vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5 +.irp r, q2, \r1, q3, \r3 + vmin.s32 \r, \r, q5 +.endr +.irp r, q2, \r1, q3, \r3 + vmax.s32 \r, \r, q4 +.endr + + vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6 - vrshr.s32 q4, q4, #12 // t5 + vrshr.s32 q7, q7, #12 // t5 vrshr.s32 q5, q6, #12 // t6 vqsub.s32 \r7, \r0, q3 // out7 vqadd.s32 \r0, \r0, q3 // out0 vqadd.s32 \r1, \r2, q5 // out1 vqsub.s32 q6, \r2, q5 // out6 - vqadd.s32 \r2, \r4, q4 // out2 - vqsub.s32 \r5, \r4, q4 // out5 + vqadd.s32 \r2, \r4, q7 // out2 + vqsub.s32 \r5, \r4, q7 // out5 vqadd.s32 \r3, \r6, q2 // out3 vqsub.s32 \r4, \r6, q2 // out4 vmov \r6, q6 // out6 @@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst .macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_2s_x4 \r0, \r2, \r4, \r6 + vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + vmin.s32 \r, \r, d9 +.endr +.irp r, \r0, \r2, \r4, \r6 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a @@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst vqadd.s32 d5, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a +.irp r, d4, \r1, d5, \r3 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, \r1, d5, \r3 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6 vrshr.s32 d6, d6, #12 // t5 @@ -763,19 +795,28 @@ endfunc vqadd.s32 q2, q8, q12 // t0 vqsub.s32 q3, q8, q12 // t4 + vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vqadd.s32 q4, q15, q11 // t1 vqsub.s32 q5, q15, q11 // t5 vqadd.s32 q6, q10, q14 // t2 vqsub.s32 q7, q10, q14 // t6 + vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 q10, q13, q9 // t3 vqsub.s32 q11, q13, q9 // t7 +.irp r, q2, q3, q4, q5, q6, q7, q10, q11 + vmin.s32 \r, \r, q12 +.endr +.irp r, q2, q3, q4, q5, q6, q7, q10, q11 + vmax.s32 \r, \r, q14 +.endr + vmul_vmla q8, q3, q5, d1[1], d1[0] - vmul_vmls q12, q3, q5, d1[0], d1[1] + vmul_vmls q13, q3, q5, d1[0], d1[1] vmul_vmls q14, q11, q7, d1[1], d1[0] vrshr.s32 q3, q8, #12 // t4a - vrshr.s32 q5, q12, #12 // t5a + vrshr.s32 q5, q13, #12 // t5a vmul_vmla q8, q11, q7, d1[0], d1[1] @@ -786,12 +827,24 
@@ endfunc vqsub.s32 q2, q2, q6 // t2 vqadd.s32 \r7, q4, q10 // out7 vqsub.s32 q4, q4, q10 // t3 - vqneg.s32 \r7, \r7 // out7 + + vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 \r1, q3, q7 // out1 vqsub.s32 q3, q3, q7 // t6 vqadd.s32 \r6, q5, q11 // out6 vqsub.s32 q5, q5, q11 // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. +.irp r, q2, q4, q3, q5 + vmin.s32 \r, \r, q12 +.endr +.irp r, q2, q4, q3, q5 + vmax.s32 \r, \r, q10 +.endr + + vqneg.s32 \r7, \r7 // out7 vqneg.s32 \r1, \r1 // out1 vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12) @@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30 + // idct_8 leaves the row_clip_max/min constants in d9 and d8 +.irp r, d16, d18, d20, d22, d24, d26, d28, d30 + vmin.s32 \r, \r, d9 +.endr +.irp r, d16, d18, d20, d22, d24, d26, d28, d30 + vmax.s32 \r, \r, d8 +.endr + vld1.32 {q0, q1}, [r12, :128] sub r12, r12, #32 @@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon vqadd.s32 d25, d29, d27 // t12 vqsub.s32 d29, d29, d27 // t13 +.irp r, d4, d17, d5, d31, d23, d19, d25, d29 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, d17, d5, d31, d23, d19, d25, d29 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a vrshr.s32 d21, d6, #12 // t9a @@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon vqsub.s32 d25, d27, d29 // t13 vqadd.s32 d27, d27, d29 // t14 +.irp r, d4, d17, d5, d31, d19, d21, d25, d27 + vmin.s32 \r, \r, d9 +.endr +.irp r, d4, d17, d5, d31, d19, d21, d25, d27 + vmax.s32 \r, \r, d8 +.endr + vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11 vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12 vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a @@ -1193,6 +1268,9 @@ endfunc vld1.32 {q0, q1}, [r12, :128] + vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + vqsub.s32 d5, d16, d23 // t8a vqadd.s32 d16, d16, d23 // t0a vqsub.s32 d7, d31, d24 // t9a @@ -1210,6 +1288,13 @@ endfunc vqadd.s32 d28, d25, d30 // t7a vqsub.s32 d25, d25, d30 // t15a +.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 + vmin.s32 \r, \r, d11 +.endr +.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 + vmax.s32 \r, \r, d10 +.endr + vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8 vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9 vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10 @@ -1244,6 +1329,13 @@ endfunc vqadd.s32 d20, d29, d22 // t11a vqsub.s32 d29, d29, d22 // t15a +.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 + vmax.s32 \r, \r, d10 +.endr + vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a @@ -1272,24 +1364,34 @@ endfunc vqadd.s32 \o15,d31, d26 // out15 vmov \o0, d4 .endif - vqneg.s32 \o15, \o15 // out15 vqsub.s32 d3, d29, d18 // t15a vqadd.s32 \o13,d29, d18 // out13 vqadd.s32 \o2, d17, d30 // out2 vqsub.s32 d26, d17, d30 // t14a - vqneg.s32 \o13,\o13 // out13 vqadd.s32 \o1, d19, d27 // out1 vqsub.s32 d27, d19, d27 // t10 vqadd.s32 \o14,d28, d20 // out14 vqsub.s32 d20, d28, d20 // t11 - vqneg.s32 \o1, \o1 // out1 vqadd.s32 \o3, d22, d24 // out3 vqsub.s32 d22, d22, d24 // t6 vqadd.s32 \o12,d25, d23 // out12 
vqsub.s32 d23, d25, d23 // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. +.irp r, d2, d21, d3, d26, d27, d20, d22, d23 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d21, d3, d26, d27, d20, d22, d23 + vmax.s32 \r, \r, d10 +.endr + + vqneg.s32 \o15, \o15 // out15 + vqneg.s32 \o13,\o13 // out13 + vqneg.s32 \o1, \o1 // out1 vqneg.s32 \o3, \o3 // out3 vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) @@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon vld1.32 {q0, q1}, [r12, :128] + vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 + vqsub.s32 d5, d16, d24 // t17 vqadd.s32 d16, d16, d24 // t16 vqsub.s32 d7, d31, d23 // t30 @@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon vqadd.s32 d25, d19, d27 // t28 vqsub.s32 d19, d19, d27 // t29 +.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 + vmin.s32 \r, \r, d11 +.endr +.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 + vmax.s32 \r, \r, d10 +.endr + vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a @@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon vqsub.s32 d29, d31, d25 // t28a vqadd.s32 d31, d31, d25 // t31a +.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 + vmax.s32 \r, \r, d10 +.endr + vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19 @@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon vqsub.s32 d24, d24, d19 // t27a vmov d19, d4 // out19 +.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 + vmin.s32 \r, \r, d11 +.endr +.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 + vmax.s32 \r, \r, d10 +.endr + vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20 vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27 vrshr.s32 d20, d4, #12 // t20 @@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct_2s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in d9 and d8, + // but here we want to use full q registers for clipping. 
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmin.s32 \r, \r, q3 +.endr +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmax.s32 \r, \r, q2 +.endr + vtrn.32 d16, d17 vtrn.32 d18, d19 vtrn.32 d20, d21 @@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon vqsub.s32 d30, d23, d22 // t62 vqadd.s32 d31, d23, d22 // t63 +.irp r, q12, q13, q14, q15 + vmin.s32 \r, \r, q5 +.endr +.irp r, q12, q13, q14, q15 + vmax.s32 \r, \r, q4 +.endr + vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a vneg.s32 d4, d4 // t34a - vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a + vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a vrshr.s32 d26, d4, #12 // t34a vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a vrshr.s32 d29, d6, #12 // t61a - vrshr.s32 d25, d8, #12 // t33a + vrshr.s32 d25, d7, #12 // t33a vrshr.s32 d30, d4, #12 // t62a vqadd.s32 d16, d24, d27 // t32a @@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon vqsub.s32 d21, d30, d29 // t61 vqadd.s32 d22, d30, d29 // t62 +.irp r, q8, q9, q10, q11 + vmin.s32 \r, \r, q5 +.endr +.irp r, q8, q9, q10, q11 + vmax.s32 \r, \r, q4 +.endr + vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a - vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60 + vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60 vrshr.s32 d21, d4, #12 // t61a vrshr.s32 d18, d6, #12 // t34a vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35 - vrshr.s32 d20, d8, #12 // t60 + vrshr.s32 d20, d7, #12 // t60 vrshr.s32 d19, d4, #12 // t35 vst1.32 {d16, d17, d18, d19}, [r6, :128]! @@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon vqadd.s32 d30, d23, d22 // t48 vqsub.s32 d31, d23, d22 // t55 +.irp r, q12, q13, q14, q15 + vmin.s32 \r, \r, q5 +.endr +.irp r, q12, q13, q14, q15 + vmax.s32 \r, \r, q4 +.endr + vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a - vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a + vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a vrshr.s32 d25, d4, #12 // t56a vrshr.s32 d27, d6, #12 // t39a - vneg.s32 d8, d8 // t40a + vneg.s32 d7, d7 // t40a vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a - vrshr.s32 d31, d8, #12 // t40a + vrshr.s32 d31, d7, #12 // t40a vrshr.s32 d28, d4, #12 // t55a vqadd.s32 d16, d24, d29 // t32a @@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon vqsub.s32 d21, d25, d28 // t55 vqadd.s32 d22, d25, d28 // t56 +.irp r, q8, q9, q10, q11 + vmin.s32 \r, \r, q5 +.endr +.irp r, q8, q9, q10, q11 + vmax.s32 \r, \r, q4 +.endr + vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a - vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47 + vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47 vrshr.s32 d18, d4, #12 // t40a vrshr.s32 d21, d6, #12 // t55a vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48 - vrshr.s32 d19, d8, #12 // t47 + vrshr.s32 d19, d7, #12 // t47 vrshr.s32 d20, d4, #12 // t48 vstr d16, [r6, #4*2*0] // t32a @@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon bl inv_dct_2s_x16_neon + // idct_16 leaves the row_clip_max/min constants in d9 and d8, + // but here we want to use full q registers for clipping. 
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmin.s32 \r, \r, q3 +.endr +.irp r, q8, q9, q10, q11, q12, q13, q14, q15 + vmax.s32 \r, \r, q2 +.endr + store16 r6 movdup_if d0, r12, 2896*8*(1<<16), \scale @@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon mov r9, #-8 + vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .macro store_addsub r0, r1, r2, r3 vld1.32 {d2}, [r6, :64]! vld1.32 {d3}, [r6, :64]! @@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon vld1.32 {d4}, [r6, :64]! vqadd.s32 d7, d3, \r1 vqsub.s32 \r1, d3, \r1 + vmin.s32 d6, d6, d1 + vmin.s32 \r0, \r0, d1 vld1.32 {d5}, [r6, :64]! vqadd.s32 d2, d4, \r2 sub r6, r6, #8*4 + vmax.s32 d6, d6, d0 + vmax.s32 \r0, \r0, d0 vqsub.s32 \r2, d4, \r2 + vmin.s32 d7, d7, d1 + vmin.s32 \r1, \r1, d1 vst1.32 {d6}, [r6, :64]! vst1.32 {\r0}, [r10, :64], r9 + vmin.s32 d2, d2, d1 + vmin.s32 \r2, \r2, d1 + vmax.s32 d7, d7, d0 + vmax.s32 \r1, \r1, d0 vqadd.s32 d3, d5, \r3 vqsub.s32 \r3, d5, \r3 + vmax.s32 d2, d2, d0 + vmax.s32 \r2, \r2, d0 + vmin.s32 d3, d3, d1 + vmin.s32 \r3, \r3, d1 vst1.32 {d7}, [r6, :64]! vst1.32 {\r1}, [r10, :64], r9 + vmax.s32 d3, d3, d0 + vmax.s32 \r3, \r3, d0 vst1.32 {d2}, [r6, :64]! vst1.32 {\r2}, [r10, :64], r9 vst1.32 {d3}, [r6, :64]! @@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon add r6, r6, #2*4*16 movrel_local r12, idct64_coeffs + vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff + vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear add r9, r7, r8, lsl #4 // offset 16 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S index c9650e9d544..b1b2f8fe659 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S @@ -483,10 +483,10 @@ endfunc add \o1\().4s, v5.4s, v7.4s sub \o3\().4s, \o3\().4s, v7.4s - rshrn \o0\().4h, \o0\().4s, #12 - rshrn \o2\().4h, \o2\().4s, #12 - rshrn \o1\().4h, \o1\().4s, #12 - rshrn \o3\().4h, \o3\().4s, #12 + sqrshrn \o0\().4h, \o0\().4s, #12 + sqrshrn \o2\().4h, \o2\().4s, #12 + sqrshrn \o1\().4h, \o1\().4s, #12 + sqrshrn \o3\().4h, \o3\().4s, #12 .endm function inv_adst_4h_x4_neon, export=1 @@ -538,21 +538,21 @@ endfunc sub v4.4s, v4.4s, v2.4s // out3 sub v5.4s, v5.4s, v3.4s - rshrn v18.4h, v18.4s, #12 - rshrn2 v18.8h, v19.4s, #12 + sqrshrn v18.4h, v18.4s, #12 + sqrshrn2 v18.8h, v19.4s, #12 - rshrn \o0\().4h, v16.4s, #12 - rshrn2 \o0\().8h, v17.4s, #12 + sqrshrn \o0\().4h, v16.4s, #12 + sqrshrn2 \o0\().8h, v17.4s, #12 .ifc \o2, v17 mov v17.16b, v18.16b .endif - rshrn \o1\().4h, v6.4s, #12 - rshrn2 \o1\().8h, v7.4s, #12 + sqrshrn \o1\().4h, v6.4s, #12 + sqrshrn2 \o1\().8h, v7.4s, #12 - rshrn \o3\().4h, v4.4s, #12 - rshrn2 \o3\().8h, v5.4s, #12 + sqrshrn \o3\().4h, v4.4s, #12 + sqrshrn2 \o3\().8h, v5.4s, #12 .endm function inv_adst_8h_x4_neon, export=1 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S index 0a0c7768b13..eee3a9636de 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S @@ -124,6 +124,13 @@ endconst .endif .endm +.macro smin_4s r0, r1, r2 + smin \r0\().4s, 
\r1\().4s, \r2\().4s +.endm +.macro smax_4s r0, r1, r2 + smax \r0\().4s, \r1\().4s, \r2\().4s +.endm + .macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 @@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4 \r0, \r2, \r4, \r6 + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 +.irp r, \r0, \r2, \r4, \r6 + smin_4s \r, \r, v5 +.endr +.irp r, \r0, \r2, \r4, \r6 + smax_4s \r, \r, v4 +.endr + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a - mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a srshr \r1\().4s, v2.4s, #12 // t4a - srshr \r7\().4s, v4.4s, #12 // t7a + srshr \r7\().4s, v3.4s, #12 // t7a srshr \r3\().4s, v6.4s, #12 // t5a srshr \r5\().4s, v7.4s, #12 // t6a @@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst sqadd v3.4s, \r7\().4s, \r5\().4s // t7 sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a - mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5 +.irp r, v2, \r1, v3, \r3 + smin_4s \r, \r, v5 +.endr +.irp r, v2, \r1, v3, \r3 + smax_4s \r, \r, v4 +.endr + + mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5 mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 - srshr v4.4s, v4.4s, #12 // t5 - srshr v5.4s, v6.4s, #12 // t6 + srshr v7.4s, v7.4s, #12 // t5 + srshr v6.4s, v6.4s, #12 // t6 sqsub \r7\().4s, \r0\().4s, v3.4s // out7 sqadd \r0\().4s, \r0\().4s, v3.4s // out0 - sqadd \r1\().4s, \r2\().4s, v5.4s // out1 - sqsub v6.4s, \r2\().4s, v5.4s // out6 - sqadd \r2\().4s, \r4\().4s, v4.4s // out2 - sqsub \r5\().4s, \r4\().4s, v4.4s // out5 + sqadd \r1\().4s, \r2\().4s, v6.4s // out1 + sqsub v6.4s, \r2\().4s, v6.4s // out6 + sqadd \r2\().4s, \r4\().4s, v7.4s // out2 + sqsub \r5\().4s, \r4\().4s, v7.4s // out5 sqadd \r3\().4s, \r6\().4s, v2.4s // out3 sqsub \r4\().4s, \r6\().4s, v2.4s // out4 mov \r6\().16b, v6.16b // out6 @@ -660,8 +683,11 @@ endfunc ld1 {v0.4s}, [x16] + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + sqadd v2.4s, v16.4s, v20.4s // t0 sqsub v3.4s, v16.4s, v20.4s // t4 + mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd v4.4s, v23.4s, v19.4s // t1 sqsub v5.4s, v23.4s, v19.4s // t5 sqadd v6.4s, v18.4s, v22.4s // t2 @@ -669,6 +695,13 @@ endfunc sqadd v18.4s, v21.4s, v17.4s // t3 sqsub v19.4s, v21.4s, v17.4s // t7 +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v3, v4, v5, v6, v7, v18, v19 + smax_4s \r, \r, v20 +.endr + mul_mla v16, v3, v5, v0.s[3], v0.s[2] mul_mls v20, v3, v5, v0.s[2], v0.s[3] mul_mls v22, v19, v7, v0.s[3], v0.s[2] @@ -685,12 +718,24 @@ endfunc sqsub v2.4s, v2.4s, v6.4s // t2 sqadd \o7\().4s, v4.4s, v18.4s // out7 sqsub v4.4s, v4.4s, v18.4s // t3 - sqneg \o7\().4s, \o7\().4s // out7 + + mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd \o1\().4s, v3.4s, v7.4s // out1 sqsub v3.4s, v3.4s, v7.4s // t6 sqadd \o6\().4s, v5.4s, v19.4s // out6 sqsub v5.4s, v5.4s, v19.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. 
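The row_clip_max/row_clip_min constants that recur throughout these transform changes describe the clamp applied to intermediate idct/adst values: ~(~bdmax << 7) and (~bdmax << 7), which for the 10-bit path (bdmax = 0x3ff) come out to 0x1ffff and 0xfffe0000, exactly as the comments note. Below is a minimal scalar sketch of that clamp, assuming the hard-coded 10-bit bound; the helper name is illustrative and not part of the diff.

    #include <stdint.h>

    // Scalar equivalent of the vmin.s32/vmax.s32 (smin/smax on AArch64) pairs
    // inserted between the butterfly stages above.
    static inline int32_t clip_row_intermediate(int32_t v) {
        const int32_t bdmax = 0x3ff;                // 10-bit sample maximum
        const int32_t row_clip_min = ~bdmax * 128;  // (~bdmax << 7) = (int32_t)0xfffe0000
        const int32_t row_clip_max = ~row_clip_min; // ~(~bdmax << 7) = 0x1ffff
        if (v < row_clip_min) return row_clip_min;
        if (v > row_clip_max) return row_clip_max;
        return v;
    }

As the in-line comments point out, registers that are immediately downshifted and narrowed afterwards are deliberately left unclipped, since the narrowing step bounds them anyway.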
+.irp r, v2, v4, v3, v5 + smin_4s \r, \r, v1 +.endr +.irp r, v2, v4, v3, v5 + smax_4s \r, \r, v18 +.endr + + sqneg \o7\().4s, \o7\().4s // out7 sqneg \o1\().4s, \o1\().4s // out1 mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) @@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + // idct_8 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v16, v18, v20, v22, v24, v26, v28, v30 + smax \r\().4s, \r\().4s, v4.4s +.endr + ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #32 mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a - mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a srshr v17.4s, v2.4s, #12 // t8a - srshr v31.4s, v4.4s, #12 // t15a + srshr v31.4s, v3.4s, #12 // t15a mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a - mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a + mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a srshr v23.4s, v6.4s, #12 // t9a srshr v25.4s, v2.4s, #12 // t14a mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a - srshr v21.4s, v4.4s, #12 // t10a + srshr v21.4s, v3.4s, #12 // t10a srshr v27.4s, v6.4s, #12 // t13a - mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a + mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a srshr v19.4s, v2.4s, #12 // t11a - srshr v29.4s, v4.4s, #12 // t12a + srshr v29.4s, v3.4s, #12 // t12a ld1 {v0.4s}, [x16] @@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon sqadd v25.4s, v29.4s, v27.4s // t12 sqsub v29.4s, v29.4s, v27.4s // t13 - mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v23, v19, v25, v29 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a - srshr v21.4s, v4.4s, #12 // t9a + srshr v21.4s, v7.4s, #12 // t9a srshr v27.4s, v6.4s, #12 // t14a - mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a - srshr v29.4s, v4.4s, #12 // t13a + srshr v29.4s, v7.4s, #12 // t13a neg v6.4s, v6.4s srshr v23.4s, v6.4s, #12 // t10a @@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon sqsub v25.4s, v27.4s, v29.4s // t13 sqadd v27.4s, v27.4s, v29.4s // t14 - mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11 +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v17, v3, v31, v19, v21, v25, v27 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11 mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a - srshr v4.4s, v4.4s, #12 // t11 - srshr v5.4s, v6.4s, #12 // t12 - mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a + srshr v7.4s, v7.4s, #12 // t11 + srshr v6.4s, v6.4s, #12 // t12 + mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a srshr v2.4s, v2.4s, #12 // t10a - srshr v3.4s, v6.4s, #12 // t13a + srshr v3.4s, v3.4s, #12 // t13a - sqadd v6.4s, v16.4s, v31.4s // out0 + sqadd v1.4s, v16.4s, v31.4s // out0 sqsub v31.4s, v16.4s, v31.4s // out15 - mov v16.16b, v6.16b + mov v16.16b, v1.16b sqadd v23.4s, v30.4s, v17.4s // out7 - sqsub v7.4s, v30.4s, v17.4s // out8 + sqsub v1.4s, v30.4s, v17.4s // out8 sqadd v17.4s, v18.4s, v27.4s // out1 sqsub v30.4s, v18.4s, 
v27.4s // out14 sqadd v18.4s, v20.4s, v3.4s // out2 sqsub v29.4s, v20.4s, v3.4s // out13 sqadd v3.4s, v28.4s, v19.4s // out6 sqsub v25.4s, v28.4s, v19.4s // out9 - sqadd v19.4s, v22.4s, v5.4s // out3 - sqsub v28.4s, v22.4s, v5.4s // out12 - sqadd v20.4s, v24.4s, v4.4s // out4 - sqsub v27.4s, v24.4s, v4.4s // out11 + sqadd v19.4s, v22.4s, v6.4s // out3 + sqsub v28.4s, v22.4s, v6.4s // out12 + sqadd v20.4s, v24.4s, v7.4s // out4 + sqsub v27.4s, v24.4s, v7.4s // out11 sqadd v21.4s, v26.4s, v2.4s // out5 sqsub v26.4s, v26.4s, v2.4s // out10 - mov v24.16b, v7.16b + mov v24.16b, v1.16b mov v22.16b, v3.16b ret @@ -1084,6 +1151,9 @@ endfunc ld1 {v0.4s, v1.4s}, [x16] + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + sqsub v2.4s, v16.4s, v23.4s // t8a sqadd v16.4s, v16.4s, v23.4s // t0a sqsub v3.4s, v31.4s, v24.4s // t9a @@ -1101,6 +1171,13 @@ endfunc sqadd v28.4s, v25.4s, v30.4s // t7a sqsub v25.4s, v25.4s, v30.4s // t15a +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 + smax_4s \r, \r, v7 +.endr + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 @@ -1135,6 +1212,13 @@ endfunc sqadd v20.4s, v29.4s, v22.4s // t11a sqsub v29.4s, v29.4s, v22.4s // t15a +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 + smax_4s \r, \r, v7 +.endr + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a @@ -1163,24 +1247,34 @@ endfunc sqadd \o15\().4s, v31.4s, v26.4s // out15 mov \o0\().16b, v4.16b .endif - sqneg \o15\().4s, \o15\().4s // out15 sqsub v3.4s, v29.4s, v18.4s // t15a sqadd \o13\().4s, v29.4s, v18.4s // out13 sqadd \o2\().4s, v17.4s, v30.4s // out2 sqsub v26.4s, v17.4s, v30.4s // t14a - sqneg \o13\().4s, \o13\().4s // out13 sqadd \o1\().4s, v19.4s, v27.4s // out1 sqsub v27.4s, v19.4s, v27.4s // t10 sqadd \o14\().4s, v28.4s, v20.4s // out14 sqsub v20.4s, v28.4s, v20.4s // t11 - sqneg \o1\().4s, \o1\().4s // out1 sqadd \o3\().4s, v22.4s, v24.4s // out3 sqsub v22.4s, v22.4s, v24.4s // t6 sqadd \o12\().4s, v25.4s, v23.4s // out12 sqsub v23.4s, v25.4s, v23.4s // t7 + + // Not clipping the output registers, as they will be downshifted and + // narrowed afterwards anyway. 
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smin_4s \r, \r, v5 +.endr +.irp r, v2, v21, v3, v26, v27, v20, v22, v23 + smax_4s \r, \r, v7 +.endr + + sqneg \o15\().4s, \o15\().4s // out15 + sqneg \o13\().4s, \o13\().4s // out13 + sqneg \o1\().4s, \o1\().4s // out1 sqneg \o3\().4s, \o3\().4s // out3 mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) @@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon ld1 {v0.4s, v1.4s}, [x16] + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + sqsub v2.4s, v16.4s, v24.4s // t17 sqadd v16.4s, v16.4s, v24.4s // t16 sqsub v3.4s, v31.4s, v23.4s // t30 @@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon sqadd v25.4s, v19.4s, v27.4s // t28 sqsub v19.4s, v19.4s, v27.4s // t29 - mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a - srshr v21.4s, v4.4s, #12 // t17a + srshr v21.4s, v7.4s, #12 // t17a srshr v27.4s, v6.4s, #12 // t30a neg v2.4s, v2.4s // -> t18a - mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a srshr v19.4s, v2.4s, #12 // t18a - srshr v24.4s, v4.4s, #12 // t29a + srshr v24.4s, v7.4s, #12 // t29a mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a - mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a + mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a srshr v22.4s, v6.4s, #12 // t21a srshr v18.4s, v2.4s, #12 // t26a - neg v4.4s, v4.4s // -> t22a + neg v7.4s, v7.4s // -> t22a mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a - srshr v17.4s, v4.4s, #12 // t22a + srshr v17.4s, v7.4s, #12 // t22a srshr v20.4s, v6.4s, #12 // t25a sqsub v2.4s, v27.4s, v24.4s // t29 @@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon sqsub v29.4s, v31.4s, v25.4s // t28a sqadd v31.4s, v31.4s, v25.4s // t31a - mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 - srshr v18.4s, v4.4s, #12 // t18a + srshr v18.4s, v7.4s, #12 // t18a srshr v25.4s, v6.4s, #12 // t29a - mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28 mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 srshr v29.4s, v2.4s, #12 // t19 - srshr v24.4s, v4.4s, #12 // t28 + srshr v24.4s, v7.4s, #12 // t28 neg v6.4s, v6.4s // -> t20 mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 - mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a + mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a srshr v26.4s, v6.4s, #12 // t20 srshr v19.4s, v2.4s, #12 // t27 - neg v4.4s, v4.4s // -> t21a + neg v7.4s, v7.4s // -> t21a mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a - srshr v20.4s, v4.4s, #12 // t21a + srshr v20.4s, v7.4s, #12 // t21a srshr v28.4s, v6.4s, #12 // t26a sqsub v2.4s, v16.4s, 
v30.4s // t23 @@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon sqsub v21.4s, v27.4s, v22.4s // t25a sqsub v27.4s, v18.4s, v20.4s // t21 sqadd v18.4s, v18.4s, v20.4s // t18 = out18 - sqadd v4.4s, v29.4s, v26.4s // t19a = out19 + sqadd v7.4s, v29.4s, v26.4s // t19a = out19 sqsub v26.4s, v29.4s, v26.4s // t20a sqadd v29.4s, v25.4s, v28.4s // t29 = out29 sqsub v25.4s, v25.4s, v28.4s // t26 sqadd v28.4s, v24.4s, v19.4s // t28a = out28 sqsub v24.4s, v24.4s, v19.4s // t27a - mov v19.16b, v4.16b // out19 + mov v19.16b, v7.16b // out19 - mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smin \r\().4s, \r\().4s, v5.4s +.endr +.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 + smax \r\().4s, \r\().4s, v4.4s +.endr + + mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20 mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 - srshr v20.4s, v4.4s, #12 // t20 + srshr v20.4s, v7.4s, #12 // t20 srshr v22.4s, v6.4s, #12 // t27 - mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a mov v27.16b, v22.16b // t27 - srshr v26.4s, v4.4s, #12 // t26a + srshr v26.4s, v7.4s, #12 // t26a mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 - mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 + mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25 srshr v21.4s, v6.4s, #12 // t21a srshr v22.4s, v24.4s, #12 // t22 - srshr v25.4s, v4.4s, #12 // t25 + srshr v25.4s, v7.4s, #12 // t25 - mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a - srshr v23.4s, v4.4s, #12 // t23a + srshr v23.4s, v7.4s, #12 // t23a srshr v24.4s, v6.4s, #12 // t24a ret @@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_4s_x16_neon + + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 @@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon sqsub v30.4s, v23.4s, v22.4s // t62 sqadd v31.4s, v23.4s, v22.4s // t63 +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a - mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a + mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a neg v2.4s, v2.4s // t34a mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a srshr v26.4s, v2.4s, #12 // t34a mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a - srshr v29.4s, v4.4s, #12 // t61a + srshr v29.4s, v7.4s, #12 // t61a srshr v25.4s, v6.4s, #12 // t33a srshr v30.4s, v2.4s, #12 // t62a @@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon sqsub v21.4s, v30.4s, v29.4s // t61 sqadd v22.4s, v30.4s, v29.4s // t62 +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a - mul_mls v4, v21, v18, v0.s[3], v0.s[2] 
// -> t34a + mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 srshr v21.4s, v2.4s, #12 // t61a - srshr v18.4s, v4.4s, #12 // t34a + srshr v18.4s, v7.4s, #12 // t34a mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 srshr v20.4s, v6.4s, #12 // t60 srshr v19.4s, v2.4s, #12 // t35 @@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon sqadd v30.4s, v23.4s, v22.4s // t48 sqsub v31.4s, v23.4s, v22.4s // t55 +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a - mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a srshr v25.4s, v2.4s, #12 // t56a - srshr v27.4s, v4.4s, #12 // t39a + srshr v27.4s, v7.4s, #12 // t39a neg v6.4s, v6.4s // t40a mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a srshr v31.4s, v6.4s, #12 // t40a @@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon sqsub v21.4s, v25.4s, v28.4s // t55 sqadd v22.4s, v25.4s, v28.4s // t56 +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v19, v17, v18, v20, v23, v21, v22 + smax_4s \r, \r, v4 +.endr + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a - mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 srshr v18.4s, v2.4s, #12 // t40a - srshr v21.4s, v4.4s, #12 // t55a + srshr v21.4s, v7.4s, #12 // t55a mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 srshr v19.4s, v6.4s, #12 // t47 srshr v20.4s, v2.4s, #12 // t48 @@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon bl inv_dct_4s_x16_neon + // idct_16 leaves the row_clip_max/min constants in v5 and v4 +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smin_4s \r, \r, v5 +.endr +.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + smax_4s \r, \r, v4 +.endr + store16 x6 movz16dup_if v0.2s, w16, #2896*8, \scale @@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon mov x9, #-16 + movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 + .macro store_addsub r0, r1, r2, r3 ld1 {v2.4s}, [x6], #16 ld1 {v3.4s}, [x6], #16 @@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon ld1 {v4.4s}, [x6], #16 sqadd v7.4s, v3.4s, \r1 sqsub \r1, v3.4s, \r1 + smin v6.4s, v6.4s, v1.4s + smin \r0, \r0, v1.4s ld1 {v5.4s}, [x6], #16 sqadd v2.4s, v4.4s, \r2 sub x6, x6, #16*4 + smax v6.4s, v6.4s, v0.4s + smax \r0, \r0, v0.4s sqsub \r2, v4.4s, \r2 + smin v7.4s, v7.4s, v1.4s + smin \r1, \r1, v1.4s st1 {v6.4s}, [x6], #16 st1 {\r0}, [x10], x9 + smin v2.4s, v2.4s, v1.4s + smin \r2, \r2, v1.4s + smax v7.4s, v7.4s, v0.4s + smax \r1, \r1, v0.4s sqadd v3.4s, v5.4s, \r3 sqsub \r3, v5.4s, \r3 + smax v2.4s, v2.4s, v0.4s + smax \r2, \r2, v0.4s + smin v3.4s, v3.4s, v1.4s + smin \r3, \r3, v1.4s st1 {v7.4s}, [x6], #16 st1 {\r1}, [x10], x9 + smax v3.4s, v3.4s, v0.4s + smax \r3, \r3, v0.4s st1 {v2.4s}, [x6], #16 st1 {\r2}, [x10], x9 st1 {v3.4s}, [x6], #16 @@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon add x6, x6, #4*4*16 movrel x17, idct64_coeffs + movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff + mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 movz16dup_if 
v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear add x9, x7, x8, lsl #4 // offset 16 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S index 2b9b5c408ec..63d5de10ada 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_16_wd\wd\()_neon uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_16_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.16b, #1 uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) @@ -474,20 +481,20 @@ function lpf_16_wd\wd\()_neon bif v11.16b, v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -497,22 +504,34 @@ loop_filter 6 loop_filter 4 .macro lpf_16_wd16 - adr x13, 7f - adr x14, 8f bl lpf_16_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd8 - adr x14, 8f bl lpf_16_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd6 bl lpf_16_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_16_wd4 bl lpf_16_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_16_neon diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S index aab0230c44b..d181a3e6239 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_8_wd\wd\()_neon uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_8_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.8h, #1 uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) @@ -360,20 +367,20 @@ function lpf_8_wd\wd\()_neon bif v11.16b, v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -383,22 +390,34 @@ loop_filter 6 loop_filter 4 .macro lpf_8_wd16 - adr x13, 7f - adr x14, 8f bl lpf_8_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd8 - adr x14, 8f bl 
lpf_8_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_8_neon diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S index d1083c6b561..dc50415f1f1 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S +++ b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S @@ -135,6 +135,12 @@ #endif #define GNU_PROPERTY_AARCH64_PAC (1 << 1) +#elif defined(__APPLE__) && defined(__arm64e__) + +#define GNU_PROPERTY_AARCH64_PAC 0 +#define AARCH64_SIGN_LINK_REGISTER pacibsp +#define AARCH64_VALIDATE_LINK_REGISTER autibsp + #else /* __ARM_FEATURE_PAC_DEFAULT */ #define GNU_PROPERTY_AARCH64_PAC 0 diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h new file mode 100644 index 00000000000..2e8c8ab6fb8 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h @@ -0,0 +1,88 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/cdef.h" + +decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon)); + +void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, + const pixel *const bottom, int h, + enum CdefEdgeFlags edges); +void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src, + ptrdiff_t src_stride, const pixel (*left)[2], + const pixel *const top, + const pixel *const bottom, int h, + enum CdefEdgeFlags edges); + +// Passing edges to this function, to allow it to switch to a more +// optimized version for fully edged cases. Using size_t for edges, +// to avoid ABI differences for passing more than one argument on the stack. 
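The comment above explains why edges is forwarded to the NEON filter kernels as a size_t: the kernels can switch to a faster path when the block is fully edged, and a pointer-sized integer keeps the stack layout of the trailing arguments identical across ABIs. A minimal sketch of the "fully edged" test follows, assuming the CdefEdgeFlags values from src/cdef.h (restated here so the snippet is self-contained); the helper name is illustrative.

    #include <stddef.h>

    enum CdefEdgeFlags {
        CDEF_HAVE_LEFT   = 1 << 0,
        CDEF_HAVE_RIGHT  = 1 << 1,
        CDEF_HAVE_TOP    = 1 << 2,
        CDEF_HAVE_BOTTOM = 1 << 3,
    };

    // A block is "fully edged" when valid pixels exist on all four sides,
    // so no per-edge special casing is needed and the optimized path applies.
    static inline int cdef_fully_edged(const size_t edges) {
        const size_t all = CDEF_HAVE_LEFT | CDEF_HAVE_RIGHT |
                           CDEF_HAVE_TOP  | CDEF_HAVE_BOTTOM;
        return (edges & all) == all;
    }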
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); +void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, int pri_strength, + int sec_strength, int dir, int damping, int h, + size_t edges HIGHBD_DECL_SUFFIX); + +#define DEFINE_FILTER(w, h, tmp_stride) \ +static void \ +cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ + const int pri_strength, const int sec_strength, \ + const int dir, const int damping, \ + const enum CdefEdgeFlags edges \ + HIGHBD_DECL_SUFFIX) \ +{ \ + ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ + uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \ + BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \ + left, top, bottom, h, edges); \ + BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \ + sec_strength, dir, damping, h, edges \ + HIGHBD_TAIL_SUFFIX); \ +} + +DEFINE_FILTER(8, 8, 16) +DEFINE_FILTER(4, 8, 8) +DEFINE_FILTER(4, 4, 8) + +static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->dir = BF(dav1d_cdef_find_dir, neon); + c->fb[0] = cdef_filter_8x8_neon; + c->fb[1] = cdef_filter_4x8_neon; + c->fb[2] = cdef_filter_4x4_neon; +} diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h new file mode 100644 index 00000000000..48776ac8524 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h @@ -0,0 +1,204 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/filmgrain.h" +#include "asm-offsets.h" + +CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT); + +CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET); +CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE); + +void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX); + +#define GEN_GRAIN_UV(suff) \ +void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, \ + const intptr_t uv \ + HIGHBD_DECL_SUFFIX) + +GEN_GRAIN_UV(420); +GEN_GRAIN_UV(422); +GEN_GRAIN_UV(444); + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// same layout of parameters on the stack across platforms. +void BF(dav1d_fgy_32x32, neon)(pixel *const dst, + const pixel *const src, + const ptrdiff_t stride, + const uint8_t scaling[SCALING_SIZE], + const int scaling_shift, + const entry grain_lut[][GRAIN_WIDTH], + const int offsets[][2], + const int h, const ptrdiff_t clip, + const ptrdiff_t type + HIGHBD_DECL_SUFFIX); + +static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + int type = 0; + if (data->overlap_flag && row_num) + type |= 1; /* overlap y */ + if (data->overlap_flag && bx) + type |= 2; /* overlap x */ + + BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride, + scaling, data->scaling_shift, + grain_lut, offsets, bh, + data->clip_to_restricted_range, type + HIGHBD_TAIL_SUFFIX); + } +} + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// parameters on the stack with the same layout across platforms. 
+#define FGUV(nm, sx, sy) \ +void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \ + const pixel *const src, \ + const ptrdiff_t stride, \ + const uint8_t scaling[SCALING_SIZE], \ + const Dav1dFilmGrainData *const data, \ + const entry grain_lut[][GRAIN_WIDTH], \ + const pixel *const luma_row, \ + const ptrdiff_t luma_stride, \ + const int offsets[][2], \ + const ptrdiff_t h, const ptrdiff_t uv, \ + const ptrdiff_t is_id, \ + const ptrdiff_t type \ + HIGHBD_DECL_SUFFIX); \ +static void \ +fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \ + const size_t pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], const int bh, \ + const int row_num, const pixel *const luma_row, \ + const ptrdiff_t luma_stride, const int uv, const int is_id \ + HIGHBD_DECL_SUFFIX) \ +{ \ + const int rows = 1 + (data->overlap_flag && row_num > 0); \ + \ + /* seed[0] contains the current row, seed[1] contains the previous */ \ + unsigned seed[2]; \ + for (int i = 0; i < rows; i++) { \ + seed[i] = data->seed; \ + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \ + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \ + } \ + \ + int offsets[2 /* col offset */][2 /* row offset */]; \ + \ + /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \ + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \ + if (data->overlap_flag && bx) { \ + /* shift previous offsets left */ \ + for (int i = 0; i < rows; i++) \ + offsets[1][i] = offsets[0][i]; \ + } \ + \ + /* update current offsets */ \ + for (int i = 0; i < rows; i++) \ + offsets[0][i] = get_random_number(8, &seed[i]); \ + \ + int type = 0; \ + if (data->overlap_flag && row_num) \ + type |= 1; /* overlap y */ \ + if (data->overlap_flag && bx) \ + type |= 2; /* overlap x */ \ + if (data->chroma_scaling_from_luma) \ + type |= 4; \ + \ + BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \ + scaling, data, grain_lut, \ + luma_row + (bx << sx), luma_stride, \ + offsets, bh, uv, is_id, type \ + HIGHBD_TAIL_SUFFIX); \ + } \ +} + +FGUV(420, 1, 1); +FGUV(422, 1, 0); +FGUV(444, 0, 0); + +static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon); + + c->fgy_32x32xn = fgy_32x32xn_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; +} diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h new file mode 100644 index 00000000000..aef4daebbf1 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h @@ -0,0 +1,80 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/ipred.h" + +decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon)); + +decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon)); + +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); + +decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); + +static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon); + c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon); + c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon); + c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon); + c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon); + c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon); + c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon); + c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); + c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); + c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon); + c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon); + + c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon); + c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon); + c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon); + c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon); + + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon); + + c->pal_pred = BF(dav1d_pal_pred, neon); +} 
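Each of the new per-module headers ends in a static ALWAYS_INLINE *_dsp_init_arm() helper like the one above, gated on the NEON CPU flag. The sketch below shows how such a header is typically consumed by the bitdepth-templated init code; it mirrors dav1d's existing pattern, but the exact file layout and guard names are an assumption, since the consumer side is not part of this diff.

    // Sketch of a consumer translation unit (e.g. the templated ipred init).
    #include "src/cpu.h"
    #include "src/ipred.h"

    #if HAVE_ASM
    #if ARCH_AARCH64 || ARCH_ARM
    #include "src/arm/ipred.h"   // provides intra_pred_dsp_init_arm()
    #endif
    #endif

    COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
        // ... assign the C fallbacks for every entry point first ...
    #if HAVE_ASM
    #if ARCH_AARCH64 || ARCH_ARM
        intra_pred_dsp_init_arm(c);  // overrides entries when NEON is available
    #endif
    #endif
    }

Because the helper is static and inlined, each bitdepth-templated translation unit gets its own copy and no extra exported init symbols are needed.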
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/itx.h b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h new file mode 100644 index 00000000000..2ecd086b3be --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h @@ -0,0 +1,141 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/itx.h" + +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + +decl_itx17_fns( 4, 4, neon); +decl_itx16_fns( 4, 8, neon); +decl_itx16_fns( 4, 16, neon); +decl_itx16_fns( 8, 4, neon); +decl_itx16_fns( 8, 8, neon); +decl_itx16_fns( 8, 16, neon); +decl_itx2_fns ( 8, 32, neon); +decl_itx16_fns(16, 4, neon); +decl_itx16_fns(16, 8, neon); +decl_itx12_fns(16, 16, neon); +decl_itx2_fns (16, 32, neon); +decl_itx2_fns (32, 8, neon); +decl_itx2_fns (32, 16, neon); +decl_itx2_fns (32, 32, 
neon); + +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); + +static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) { +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) + +#define assign_itx1_fn(pfx, w, h, ext) \ + assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) + +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + if (BITDEPTH == 16 && bpc != 10) return; + + assign_itx17_fn( , 4, 4, neon); + assign_itx16_fn(R, 4, 8, neon); + assign_itx16_fn(R, 4, 16, neon); + assign_itx16_fn(R, 8, 4, neon); + assign_itx16_fn( , 8, 8, neon); + assign_itx16_fn(R, 8, 16, neon); + assign_itx2_fn (R, 8, 32, neon); + assign_itx16_fn(R, 16, 4, neon); + assign_itx16_fn(R, 16, 8, neon); + assign_itx12_fn( , 16, 16, neon); + assign_itx2_fn (R, 16, 32, neon); + assign_itx1_fn (R, 16, 64, neon); + assign_itx2_fn (R, 32, 8, neon); + assign_itx2_fn (R, 32, 16, neon); + assign_itx2_fn ( , 32, 32, neon); + assign_itx1_fn (R, 32, 64, neon); + assign_itx1_fn (R, 64, 16, neon); + assign_itx1_fn (R, 64, 32, neon); + assign_itx1_fn ( , 64, 64, neon); +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h index a634da27c4e..9ac08d94d29 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c +++ b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h @@ -1,5 +1,6 @@ /* - * Copyright © 2020, VideoLAN and dav1d authors + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -25,19 +26,20 @@ */ #include "src/cpu.h" -#include "src/msac.h" -#include "src/x86/msac.h" +#include "src/loopfilter.h" -#if ARCH_X86_64 -void dav1d_msac_init_x86(MsacContext *const s) { +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon)); +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon)); + +static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); - if (flags & DAV1D_X86_CPU_FLAG_SSE2) { - s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; - } + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; - if (flags & DAV1D_X86_CPU_FLAG_AVX2) { - s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; - } + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon); } -#endif diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h new file mode 100644 index 00000000000..7993dbff683 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h @@ -0,0 +1,265 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/looprestoration.h" + +#if ARCH_AARCH64 +void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +#else + +// The 8bpc version calculates things slightly differently than the reference +// C version. 
That version calculates roughly this: +// int16_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += src[idx] * fh[i]; +// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h; +// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h; +// sum += 1 << (bitdepth + 6 - round_bits_h); +// Compared to the reference C version, this is the output of the first pass +// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. +// with round_offset precompensated. +// The 16bpc version calculates things pretty much the same way as the +// reference C version, but with the end result subtracted by +// 1 << (bitdepth + 6 - round_bits_h). +void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4], + const pixel *src, ptrdiff_t stride, + const int16_t fh[8], intptr_t w, + int h, enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +// This calculates things slightly differently than the reference C version. +// This version calculates roughly this: +// int32_t sum = 0; +// for (int i = 0; i < 7; i++) +// sum += mid[idx] * fv[i]; +// sum = (sum + rounding_off_v) >> round_bits_v; +// This function assumes that the width is a multiple of 8. +void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride, + const int16_t *mid, int w, int h, + const int16_t fv[8], enum LrEdgeFlags edges, + ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX); + +static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + const int16_t (*const filter)[8] = params->filter; + ALIGN_STK_16(int16_t, mid, 68 * 384,); + int mid_stride = (w + 7) & ~7; + + // Horizontal filter + BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride, + filter[0], w, h, edges HIGHBD_TAIL_SUFFIX); + if (edges & LR_HAVE_TOP) + BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride, + filter[0], w, 2, edges + HIGHBD_TAIL_SUFFIX); + if (edges & LR_HAVE_BOTTOM) + BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL, + lpf + 6 * PXSTRIDE(stride), + stride, filter[0], w, 2, edges + HIGHBD_TAIL_SUFFIX); + + // Vertical filter + BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride], + w, h, filter[1], edges, + mid_stride * sizeof(*mid) + HIGHBD_TAIL_SUFFIX); +} +#endif + +void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const ptrdiff_t stride, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, + const int w, const int h, const int strength, + const int bitdepth_max); +void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const int32_t *a, const int16_t *b, + const int w, const int h); + +/* filter with a 3x3 box (radius=1) */ +static void dav1d_sgr_filter1_neon(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, const int h, const int strength, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; + ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,); + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; 
+ + BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges); + if (edges & LR_HAVE_TOP) + BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], + NULL, lpf, stride, w, 2, edges); + + if (edges & LR_HAVE_BOTTOM) + BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); + + dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges); + dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX); + BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h); +} + +void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const ptrdiff_t stride, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, + const int w, const int h, + const enum LrEdgeFlags edges); +void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, + const int w, const int h, const int strength, + const int bitdepth_max); +void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const int32_t *a, const int16_t *b, + const int w, const int h); + +/* filter with a 5x5 box (radius=2) */ +static void dav1d_sgr_filter2_neon(int16_t *tmp, + const pixel *src, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, const int h, const int strength, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); + int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; + ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,); + int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; + + BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges); + if (edges & LR_HAVE_TOP) + BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], + NULL, lpf, stride, w, 2, edges); + + if (edges & LR_HAVE_BOTTOM) + BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); + + dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); + dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX); + BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h); +} + +void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *t1, const int w, const int h, + const int wt HIGHBD_DECL_SUFFIX); +void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *t1, const int16_t *t2, + const int w, const int h, + const int16_t wt[2] HIGHBD_DECL_SUFFIX); + +static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, tmp, 64 * 384,); + dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf, + w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, + tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX); +} + +static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, tmp, 64 * 384,); + 
dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf, + w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, + tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX); +} + +static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, tmp1, 64 * 384,); + ALIGN_STK_16(int16_t, tmp2, 64 * 384,); + dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf, + w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); + dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf, + w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); + const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 }; + BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride, + tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); +} + +static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + +#if ARCH_AARCH64 + c->wiener[0] = BF(dav1d_wiener_filter7, neon); + c->wiener[1] = BF(dav1d_wiener_filter5, neon); +#else + c->wiener[0] = c->wiener[1] = wiener_filter_neon; +#endif + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = sgr_filter_5x5_neon; + c->sgr[1] = sgr_filter_3x3_neon; + c->sgr[2] = sgr_filter_mix_neon; + } +} diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/mc.h b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h new file mode 100644 index 00000000000..06cd533a9b4 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h @@ -0,0 +1,114 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include "src/mc.h" +#include "src/cpu.h" + +decl_mc_fn(BF(dav1d_put_8tap_regular, neon)); +decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon)); +decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon)); +decl_mc_fn(BF(dav1d_put_8tap_smooth, neon)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon)); +decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon)); +decl_mc_fn(BF(dav1d_put_8tap_sharp, neon)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon)); +decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon)); +decl_mc_fn(BF(dav1d_put_bilin, neon)); + +decl_mct_fn(BF(dav1d_prep_8tap_regular, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon)); +decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon)); +decl_mct_fn(BF(dav1d_prep_bilin, neon)); + +decl_avg_fn(BF(dav1d_avg, neon)); +decl_w_avg_fn(BF(dav1d_w_avg, neon)); +decl_mask_fn(BF(dav1d_mask, neon)); +decl_blend_fn(BF(dav1d_blend, neon)); +decl_blend_dir_fn(BF(dav1d_blend_h, neon)); +decl_blend_dir_fn(BF(dav1d_blend_v, neon)); + +decl_w_mask_fn(BF(dav1d_w_mask_444, neon)); +decl_w_mask_fn(BF(dav1d_w_mask_422, neon)); +decl_w_mask_fn(BF(dav1d_w_mask_420, neon)); + +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon)); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon)); + +decl_emu_edge_fn(BF(dav1d_emu_edge, neon)); + +static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = BF(dav1d_put_##name, suffix) +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = BF(dav1d_prep_##name, suffix) + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); + init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); + + c->avg = BF(dav1d_avg, neon); + c->w_avg = BF(dav1d_w_avg, neon); + c->mask = BF(dav1d_mask, neon); + c->blend = BF(dav1d_blend, neon); + c->blend_h = BF(dav1d_blend_h, neon); + c->blend_v = BF(dav1d_blend_v, neon); + c->w_mask[0] = BF(dav1d_w_mask_444, neon); + c->w_mask[1] = 
BF(dav1d_w_mask_422, neon); + c->w_mask[2] = BF(dav1d_w_mask_420, neon); + c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); + c->emu_edge = BF(dav1d_emu_edge, neon); +} diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h index acde030a368..4c96fc50952 100644 --- a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c +++ b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h @@ -30,7 +30,7 @@ decl_splat_mv_fn(dav1d_splat_mv_neon); -COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { +static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/chromium/third_party/dav1d/libdav1d/src/cdef.h b/chromium/third_party/dav1d/libdav1d/src/cdef.h index 2a933d54ef9..07c84d9ff50 100644 --- a/chromium/third_party/dav1d/libdav1d/src/cdef.h +++ b/chromium/third_party/dav1d/libdav1d/src/cdef.h @@ -67,8 +67,5 @@ typedef struct Dav1dCdefDSPContext { } Dav1dCdefDSPContext; bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c); -bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c); -bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c); -bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c); #endif /* DAV1D_SRC_CDEF_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c index 1c95dbf9141..59439457a18 100644 --- a/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c @@ -303,6 +303,16 @@ static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride, return best_dir; } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/cdef.h" +#elif ARCH_PPC64LE +#include "src/ppc/cdef.h" +#elif ARCH_X86 +#include "src/x86/cdef.h" +#endif +#endif + COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { c->dir = cdef_find_dir_c; c->fb[0] = cdef_filter_block_8x8_c; @@ -311,11 +321,11 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_cdef_dsp_init_arm)(c); + cdef_dsp_init_arm(c); #elif ARCH_PPC64LE - bitfn(dav1d_cdef_dsp_init_ppc)(c); + cdef_dsp_init_ppc(c); #elif ARCH_X86 - bitfn(dav1d_cdef_dsp_init_x86)(c); + cdef_dsp_init_x86(c); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/cdf.c b/chromium/third_party/dav1d/libdav1d/src/cdf.c index 8ac87fe0354..e0f2132e007 100644 --- a/chromium/third_party/dav1d/libdav1d/src/cdf.c +++ b/chromium/third_party/dav1d/libdav1d/src/cdf.c @@ -4118,7 +4118,6 @@ void dav1d_cdf_thread_ref(CdfThreadContext *const dst, } void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) { - if (cdf->ref) - dav1d_ref_dec(&cdf->ref); - memset(cdf, 0, sizeof(*cdf)); + memset(&cdf->data, 0, sizeof(*cdf) - offsetof(CdfThreadContext, data)); + dav1d_ref_dec(&cdf->ref); } diff --git a/chromium/third_party/dav1d/libdav1d/src/cpu.c b/chromium/third_party/dav1d/libdav1d/src/cpu.c index 2e5e8d9036e..d24148c352e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/cpu.c +++ b/chromium/third_party/dav1d/libdav1d/src/cpu.c @@ -48,28 +48,24 @@ #define cpu_set_t cpuset_t #endif -static unsigned flags = 0; -static unsigned flags_mask = -1; +unsigned dav1d_cpu_flags = 0U; +unsigned dav1d_cpu_flags_mask = ~0U; COLD void dav1d_init_cpu(void) { #if HAVE_ASM && 
!__has_feature(memory_sanitizer) // memory sanitizer is inherently incompatible with asm #if ARCH_AARCH64 || ARCH_ARM - flags = dav1d_get_cpu_flags_arm(); + dav1d_cpu_flags = dav1d_get_cpu_flags_arm(); #elif ARCH_PPC64LE - flags = dav1d_get_cpu_flags_ppc(); + dav1d_cpu_flags = dav1d_get_cpu_flags_ppc(); #elif ARCH_X86 - flags = dav1d_get_cpu_flags_x86(); + dav1d_cpu_flags = dav1d_get_cpu_flags_x86(); #endif #endif } -COLD unsigned dav1d_get_cpu_flags(void) { - return flags & flags_mask; -} - COLD void dav1d_set_cpu_flags_mask(const unsigned mask) { - flags_mask = mask; + dav1d_cpu_flags_mask = mask; } COLD int dav1d_num_logical_processors(Dav1dContext *const c) { @@ -99,6 +95,7 @@ COLD int dav1d_num_logical_processors(Dav1dContext *const c) { #elif defined(_SC_NPROCESSORS_ONLN) return (int)sysconf(_SC_NPROCESSORS_ONLN); #endif - dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n"); + if (c) + dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n"); return 1; } diff --git a/chromium/third_party/dav1d/libdav1d/src/cpu.h b/chromium/third_party/dav1d/libdav1d/src/cpu.h index b5c27f7a216..8f70fefe54f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/cpu.h +++ b/chromium/third_party/dav1d/libdav1d/src/cpu.h @@ -1,6 +1,6 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC + * Copyright © 2018-2022, VideoLAN and dav1d authors + * Copyright © 2018-2022, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -43,9 +43,60 @@ #include "src/x86/cpu.h" #endif +EXTERN unsigned dav1d_cpu_flags; +EXTERN unsigned dav1d_cpu_flags_mask; + void dav1d_init_cpu(void); -unsigned dav1d_get_cpu_flags(void); DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask); int dav1d_num_logical_processors(Dav1dContext *c); +static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { + unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask; + +#if TRIM_DSP_FUNCTIONS +/* Since this function is inlined, unconditionally setting a flag here will + * enable dead code elimination in the calling function. 
*/ +#if ARCH_AARCH64 || ARCH_ARM +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 + flags |= DAV1D_ARM_CPU_FLAG_NEON; +#endif +#elif ARCH_PPC64LE +#if defined(__VSX__) + flags |= DAV1D_PPC_CPU_FLAG_VSX; +#endif +#elif ARCH_X86 +#if defined(__AVX512F__) && defined(__AVX512CD__) && \ + defined(__AVX512BW__) && defined(__AVX512DQ__) && \ + defined(__AVX512VL__) && defined(__AVX512VNNI__) && \ + defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \ + defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \ + defined(__AVX512BITALG__) && defined(__GFNI__) && \ + defined(__VAES__) && defined(__VPCLMULQDQ__) + flags |= DAV1D_X86_CPU_FLAG_AVX512ICL | + DAV1D_X86_CPU_FLAG_AVX2 | + DAV1D_X86_CPU_FLAG_SSE41 | + DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif defined(__AVX2__) + flags |= DAV1D_X86_CPU_FLAG_AVX2 | + DAV1D_X86_CPU_FLAG_SSE41 | + DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif defined(__SSE4_1__) || defined(__AVX__) + flags |= DAV1D_X86_CPU_FLAG_SSE41 | + DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif defined(__SSSE3__) + flags |= DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif ARCH_X86_64 || defined(__SSE2__) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 2) + flags |= DAV1D_X86_CPU_FLAG_SSE2; +#endif +#endif +#endif + + return flags; +} + #endif /* DAV1D_SRC_CPU_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/data.c b/chromium/third_party/dav1d/libdav1d/src/data.c index fa6165ec721..8a1386ad95a 100644 --- a/chromium/third_party/dav1d/libdav1d/src/data.c +++ b/chromium/third_party/dav1d/libdav1d/src/data.c @@ -47,8 +47,9 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) { buf->ref = dav1d_ref_create(sz); if (!buf->ref) return NULL; buf->data = buf->ref->const_data; - buf->sz = buf->m.size = sz; + buf->sz = sz; dav1d_data_props_set_defaults(&buf->m); + buf->m.size = sz; return buf->ref->data; } @@ -66,8 +67,9 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr, buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie); if (!buf->ref) return DAV1D_ERR(ENOMEM); buf->data = ptr; - buf->sz = buf->m.size = sz; + buf->sz = sz; dav1d_data_props_set_defaults(&buf->m); + buf->m.size = sz; return 0; } diff --git a/chromium/third_party/dav1d/libdav1d/src/decode.c b/chromium/third_party/dav1d/libdav1d/src/decode.c index 13d57060710..2c816338a9e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/decode.c +++ b/chromium/third_party/dav1d/libdav1d/src/decode.c @@ -749,9 +749,9 @@ static inline void splat_intraref(const Dav1dContext *const c, c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4); } -static inline void mc_lowest_px(int *const dst, const int by4, const int bh4, - const int mvy, const int ss_ver, - const struct ScalableMotionParams *const smp) +static void mc_lowest_px(int *const dst, const int by4, const int bh4, + const int mvy, const int ss_ver, + const struct ScalableMotionParams *const smp) { const int v_mul = 4 >> ss_ver; if (!smp->scale) { @@ -766,14 +766,11 @@ static inline void mc_lowest_px(int *const dst, const int by4, const int bh4, } } -static inline void affine_lowest_px(Dav1dTaskContext *const t, - int *const dst, const int is_chroma, - const uint8_t *const b_dim, - const Dav1dWarpedMotionParams *const wmp) +static ALWAYS_INLINE void affine_lowest_px(Dav1dTaskContext *const t, int *const dst, + const uint8_t *const b_dim, + const Dav1dWarpedMotionParams *const wmp, + const int ss_ver, const int ss_hor) 
{ - const Dav1dFrameContext *const f = t->f; - const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; - const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7)); const int32_t *const mat = wmp->matrix; @@ -792,6 +789,25 @@ static inline void affine_lowest_px(Dav1dTaskContext *const t, } } +static NOINLINE void affine_lowest_px_luma(Dav1dTaskContext *const t, int *const dst, + const uint8_t *const b_dim, + const Dav1dWarpedMotionParams *const wmp) +{ + affine_lowest_px(t, dst, b_dim, wmp, 0, 0); +} + +static NOINLINE void affine_lowest_px_chroma(Dav1dTaskContext *const t, int *const dst, + const uint8_t *const b_dim, + const Dav1dWarpedMotionParams *const wmp) +{ + const Dav1dFrameContext *const f = t->f; + assert(f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400); + if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I444) + affine_lowest_px_luma(t, dst, b_dim, wmp); + else + affine_lowest_px(t, dst, b_dim, wmp, f->cur.p.layout & DAV1D_PIXEL_LAYOUT_I420, 1); +} + static void obmc_lowest_px(Dav1dTaskContext *const t, int (*const dst)[2], const int is_chroma, const uint8_t *const b_dim, @@ -2071,11 +2087,14 @@ static int decode_b(Dav1dTaskContext *const t, const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2]) &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv]; const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; + enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx; + if (f->frame_hdr->segmentation.lossless[b->seg_id]) { + ytx = (enum RectTxfmSize) TX_4X4; + uvtx = (enum RectTxfmSize) TX_4X4; + } dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls, t->bx, t->by, f->w4, f->h4, b->skip, bs, - f->frame_hdr->segmentation.lossless[b->seg_id] ? - (enum RectTxfmSize) TX_4X4 : b->max_ytx, - tx_split, b->uvtx, f->cur.p.layout, + ytx, tx_split, uvtx, f->cur.p.layout, &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4], has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL, has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL); @@ -2150,9 +2169,9 @@ static int decode_b(Dav1dTaskContext *const t, ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) { - affine_lowest_px(t, &lowest_px[b->ref[0]][0], 0, b_dim, - b->motion_mode == MM_WARP ? &t->warpmv : - &f->frame_hdr->gmv[b->ref[0]]); + affine_lowest_px_luma(t, &lowest_px[b->ref[0]][0], b_dim, + b->motion_mode == MM_WARP ? &t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); } else { mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y, 0, &f->svc[b->ref[0]][1]); @@ -2203,9 +2222,9 @@ static int decode_b(Dav1dTaskContext *const t, ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) { - affine_lowest_px(t, &lowest_px[b->ref[0]][1], 1, b_dim, - b->motion_mode == MM_WARP ? &t->warpmv : - &f->frame_hdr->gmv[b->ref[0]]); + affine_lowest_px_chroma(t, &lowest_px[b->ref[0]][1], b_dim, + b->motion_mode == MM_WARP ? 
&t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); } else { mc_lowest_px(&lowest_px[b->ref[0]][1], t->by & ~ss_ver, bh4 << (bh4 == ss_ver), @@ -2220,8 +2239,8 @@ static int decode_b(Dav1dTaskContext *const t, // y for (int i = 0; i < 2; i++) { if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) { - affine_lowest_px(t, &lowest_px[b->ref[i]][0], 0, b_dim, - &f->frame_hdr->gmv[b->ref[i]]); + affine_lowest_px_luma(t, &lowest_px[b->ref[i]][0], b_dim, + &f->frame_hdr->gmv[b->ref[i]]); } else { mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4, b->mv[i].y, 0, &f->svc[b->ref[i]][1]); @@ -2233,8 +2252,8 @@ static int decode_b(Dav1dTaskContext *const t, if (b->inter_mode == GLOBALMV_GLOBALMV && imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]]) { - affine_lowest_px(t, &lowest_px[b->ref[i]][1], 1, b_dim, - &f->frame_hdr->gmv[b->ref[i]]); + affine_lowest_px_chroma(t, &lowest_px[b->ref[i]][1], b_dim, + &f->frame_hdr->gmv[b->ref[i]]); } else { mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4, b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]); @@ -3407,7 +3426,7 @@ void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); } for (int i = 0; i < 7; i++) { - if (f->refp[i].p.data[0]) + if (f->refp[i].p.frame_hdr) dav1d_thread_picture_unref(&f->refp[i]); dav1d_ref_dec(&f->ref_mvs_ref[i]); } @@ -3440,13 +3459,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { // wait until all threads have completed if (!res) { if (f->c->n_tc > 1) { - pthread_mutex_lock(&f->task_thread.ttd->lock); res = dav1d_task_create_tile_sbrow(f, 0, 1); + pthread_mutex_lock(&f->task_thread.ttd->lock); + pthread_cond_signal(&f->task_thread.ttd->cond); if (!res) { - const int uses_2pass = f->c->n_fc > 1; while (!f->task_thread.done[0] || - (uses_2pass && !f->task_thread.done[1]) || - f->task_thread.task_counter > 0) + atomic_load(&f->task_thread.task_counter) > 0) { pthread_cond_wait(&f->task_thread.cond, &f->task_thread.ttd->lock); @@ -3469,7 +3487,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { static int get_upscale_x0(const int in_w, const int out_w, const int step) { const int err = out_w * step - (in_w << 14); - const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1); + const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2); return x0 & 0x3fff; } @@ -3491,10 +3509,13 @@ int dav1d_submit_frame(Dav1dContext *const c) { &c->task_thread.lock); out_delayed = &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) { - if (atomic_load(&c->task_thread.first) + 1U < c->n_fc) + unsigned first = atomic_load(&c->task_thread.first); + if (first + 1U < c->n_fc) atomic_fetch_add(&c->task_thread.first, 1U); else atomic_store(&c->task_thread.first, 0); + atomic_compare_exchange_strong(&c->task_thread.reset_task_cur, + &first, UINT_MAX); if (c->task_thread.cur && c->task_thread.cur < c->n_fc) c->task_thread.cur--; } @@ -3706,7 +3727,8 @@ int dav1d_submit_frame(Dav1dContext *const c) { const int uses_2pass = c->n_fc > 1; const int cols = f->frame_hdr->tiling.cols; const int rows = f->frame_hdr->tiling.rows; - f->task_thread.task_counter = (cols * rows + f->sbh) << uses_2pass; + atomic_store(&f->task_thread.task_counter, + (cols * rows + f->sbh) << uses_2pass); // ref_mvs if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { @@ -3726,9 +3748,10 @@ int dav1d_submit_frame(Dav1dContext *const c) { if (f->frame_hdr->use_ref_frame_mvs) { for 
(int i = 0; i < 7; i++) { const int refidx = f->frame_hdr->refidx[i]; + const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1; + const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1; if (c->refs[refidx].refmvs != NULL && - ref_coded_width[i] == f->cur.p.w && - f->refp[i].p.p.h == f->cur.p.h) + ref_w == f->bw && ref_h == f->bh) { f->ref_mvs_ref[i] = c->refs[refidx].refmvs; dav1d_ref_inc(f->ref_mvs_ref[i]); @@ -3809,7 +3832,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags; for (int i = 0; i < 8; i++) { if (refresh_frame_flags & (1 << i)) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur); @@ -3839,7 +3862,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { dav1d_thread_picture_unref(&c->out); for (int i = 0; i < 8; i++) { if (refresh_frame_flags & (1 << i)) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_cdf_thread_unref(&c->cdf[i]); dav1d_ref_dec(&c->refs[i].segmap); @@ -3860,7 +3883,7 @@ error: if (f->frame_hdr->refresh_context) dav1d_cdf_thread_unref(&f->out_cdf); for (int i = 0; i < 7; i++) { - if (f->refp[i].p.data[0]) + if (f->refp[i].p.frame_hdr) dav1d_thread_picture_unref(&f->refp[i]); dav1d_ref_dec(&f->ref_mvs_ref[i]); } diff --git a/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h b/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h index 4f555957130..17763377bc9 100644 --- a/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h +++ b/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h @@ -32,6 +32,6 @@ #include "src/levels.h" -extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2]; +EXTERN const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2]; #endif /* DAV1D_SRC_DEQUANT_TABLES_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c index ee14db9a4ce..581bcb72f5d 100644 --- a/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c @@ -51,6 +51,11 @@ static void generate_scaling(const int bitdepth, const int scaling_size = 1 << bitdepth; #endif + if (num == 0) { + memset(scaling, 0, scaling_size); + return; + } + // Fill up the preceding entries with the initial value memset(scaling, points[0][1], points[0][0] << shift_x); @@ -113,7 +118,7 @@ void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp, data, 1 HIGHBD_TAIL_SUFFIX); // Generate scaling LUTs as needed - if (data->num_y_points) + if (data->num_y_points || data->chroma_scaling_from_luma) generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]); if (data->num_uv_points[0]) generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]); diff --git a/chromium/third_party/dav1d/libdav1d/src/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/filmgrain.h index d953542a82a..a5d6be6d44f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/filmgrain.h +++ b/chromium/third_party/dav1d/libdav1d/src/filmgrain.h @@ -64,7 +64,7 @@ typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); #define decl_fguv_32x32xn_fn(name) \ void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ - const Dav1dFilmGrainData *data, int pw, \ + const Dav1dFilmGrainData *data, size_t pw, \ const uint8_t scaling[SCALING_SIZE], \ const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \ const pixel 
*luma_row, ptrdiff_t luma_stride, \ @@ -80,7 +80,5 @@ typedef struct Dav1dFilmGrainDSPContext { } Dav1dFilmGrainDSPContext; bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); -bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c); -bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c); #endif /* DAV1D_SRC_FILM_GRAIN_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c index 883c5cbb7b9..0986ac2a58c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c @@ -278,7 +278,7 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, static NOINLINE void fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, const ptrdiff_t stride, const Dav1dFilmGrainData *const data, - const int pw, const uint8_t scaling[SCALING_SIZE], + const size_t pw, const uint8_t scaling[SCALING_SIZE], const entry grain_lut[][GRAIN_WIDTH], const int bh, const int row_num, const pixel *const luma_row, const ptrdiff_t luma_stride, const int uv, const int is_id, @@ -311,8 +311,8 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, int offsets[2 /* col offset */][2 /* row offset */]; // process this row in BLOCK_SIZE^2 blocks (subsampled) - for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { - const int bw = imin(BLOCK_SIZE >> sx, pw - bx); + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { + const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx)); if (data->overlap_flag && bx) { // shift previous offsets left for (int i = 0; i < rows; i++) @@ -412,6 +412,14 @@ fguv_ss_fn(420, 1, 1); fguv_ss_fn(422, 1, 0); fguv_ss_fn(444, 0, 0); +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/filmgrain.h" +#elif ARCH_X86 +#include "src/x86/filmgrain.h" +#endif +#endif + COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { c->generate_grain_y = generate_grain_y_c; c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; @@ -425,9 +433,9 @@ COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_film_grain_dsp_init_arm)(c); + film_grain_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_film_grain_dsp_init_x86)(c); + film_grain_dsp_init_x86(c); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/getbits.c b/chromium/third_party/dav1d/libdav1d/src/getbits.c index 7bb20140e41..673070be3dd 100644 --- a/chromium/third_party/dav1d/libdav1d/src/getbits.c +++ b/chromium/third_party/dav1d/libdav1d/src/getbits.c @@ -36,51 +36,62 @@ void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data, const size_t sz) { - // If sz were 0, c->eof would need to be initialized to 1. 
assert(sz); c->ptr = c->ptr_start = data; c->ptr_end = &c->ptr_start[sz]; - c->bits_left = 0; c->state = 0; + c->bits_left = 0; c->error = 0; - c->eof = 0; } -static void refill(GetBits *const c, const unsigned n) { - assert(c->bits_left <= 56); - uint64_t state = 0; - do { - state <<= 8; - c->bits_left += 8; - if (!c->eof) - state |= *c->ptr++; +unsigned dav1d_get_bit(GetBits *const c) { + if (!c->bits_left) { if (c->ptr >= c->ptr_end) { - c->error = c->eof; - c->eof = 1; + c->error = 1; + } else { + const unsigned state = *c->ptr++; + c->bits_left = 7; + c->state = (uint64_t) state << 57; + return state >> 7; } - } while (n > c->bits_left); - c->state |= state << (64 - c->bits_left); -} - -unsigned dav1d_get_bits(GetBits *const c, const unsigned n) { - assert(n <= 32 /* can go up to 57 if we change return type */); - assert(n /* can't shift state by 64 */); - - if (n > c->bits_left) refill(c, n); + } const uint64_t state = c->state; - c->bits_left -= n; - c->state <<= n; + c->bits_left--; + c->state = state << 1; + return (unsigned) (state >> 63); +} - return (unsigned) (state >> (64 - n)); +static inline void refill(GetBits *const c, const int n) { + assert(c->bits_left >= 0 && c->bits_left < 32); + unsigned state = 0; + do { + if (c->ptr >= c->ptr_end) { + c->error = 1; + if (state) break; + return; + } + state = (state << 8) | *c->ptr++; + c->bits_left += 8; + } while (n > c->bits_left); + c->state |= (uint64_t) state << (64 - c->bits_left); } -int dav1d_get_sbits(GetBits *const c, const unsigned n) { - const int shift = 31 - n; - const int res = dav1d_get_bits(c, n + 1) << shift; - return res >> shift; +#define GET_BITS(name, type, type64) \ +type name(GetBits *const c, const int n) { \ + assert(n > 0 && n <= 32); \ + /* Unsigned cast avoids refill after eob */ \ + if ((unsigned) n > (unsigned) c->bits_left) \ + refill(c, n); \ + const uint64_t state = c->state; \ + c->bits_left -= n; \ + c->state = state << n; \ + return (type) ((type64) state >> (64 - n)); \ } +GET_BITS(dav1d_get_bits, unsigned, uint64_t) +GET_BITS(dav1d_get_sbits, int, int64_t) + unsigned dav1d_get_uleb128(GetBits *const c) { uint64_t val = 0; unsigned i = 0, more; @@ -108,15 +119,20 @@ unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) { assert(l > 1); const unsigned m = (1U << l) - max; const unsigned v = dav1d_get_bits(c, l - 1); - return v < m ? v : (v << 1) - m + dav1d_get_bits(c, 1); + return v < m ? v : (v << 1) - m + dav1d_get_bit(c); } unsigned dav1d_get_vlc(GetBits *const c) { + if (dav1d_get_bit(c)) + return 0; + int n_bits = 0; - while (!dav1d_get_bits(c, 1)) + do { if (++n_bits == 32) return 0xFFFFFFFFU; - return n_bits ? 
((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits) : 0; + } while (!dav1d_get_bit(c)); + + return ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits); } static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref, @@ -132,7 +148,7 @@ static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref, break; } - if (!dav1d_get_bits(c, 1)) { + if (!dav1d_get_bit(c)) { v += dav1d_get_bits(c, b); break; } diff --git a/chromium/third_party/dav1d/libdav1d/src/getbits.h b/chromium/third_party/dav1d/libdav1d/src/getbits.h index fc382148b2e..57b80dc7143 100644 --- a/chromium/third_party/dav1d/libdav1d/src/getbits.h +++ b/chromium/third_party/dav1d/libdav1d/src/getbits.h @@ -32,15 +32,15 @@ #include <stdint.h> typedef struct GetBits { - int error, eof; uint64_t state; - unsigned bits_left; + int bits_left, error; const uint8_t *ptr, *ptr_start, *ptr_end; } GetBits; void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz); -unsigned dav1d_get_bits(GetBits *c, unsigned n); -int dav1d_get_sbits(GetBits *c, unsigned n); +unsigned dav1d_get_bit(GetBits *c); +unsigned dav1d_get_bits(GetBits *c, int n); +int dav1d_get_sbits(GetBits *c, int n); unsigned dav1d_get_uleb128(GetBits *c); // Output in range 0..max-1 diff --git a/chromium/third_party/dav1d/libdav1d/src/internal.h b/chromium/third_party/dav1d/libdav1d/src/internal.h index eceda98eca4..b5fd1e18ef3 100644 --- a/chromium/third_party/dav1d/libdav1d/src/internal.h +++ b/chromium/third_party/dav1d/libdav1d/src/internal.h @@ -194,6 +194,7 @@ struct Dav1dContext { int strict_std_compliance; int output_invisible_frames; enum Dav1dInloopFilterType inloop_filters; + enum Dav1dDecodeFrameType decode_frame_type; int drain; enum PictureFlags frame_flags; enum Dav1dEventFlags event_flags; @@ -275,7 +276,7 @@ struct Dav1dFrameContext { struct { int next_tile_row[2 /* 0: reconstruction, 1: entropy */]; - int entropy_progress; + atomic_int entropy_progress; atomic_int deblock_progress; // in sby units atomic_uint *frame_progress, *copy_lpf_progress; // indexed using t->by * f->b4_stride + t->bx @@ -324,22 +325,28 @@ struct Dav1dFrameContext { } lf; struct { + pthread_mutex_t lock; pthread_cond_t cond; struct TaskThreadData *ttd; struct Dav1dTask *tasks, *tile_tasks[2], init_task; int num_tasks, num_tile_tasks; - int init_done; - int done[2]; + atomic_int init_done; + atomic_int done[2]; int retval; int update_set; // whether we need to update CDF reference atomic_int error; - int task_counter; + atomic_int task_counter; struct Dav1dTask *task_head, *task_tail; // Points to the task directly before the cur pointer in the queue. // This cur pointer is theoretical here, we actually keep track of the // "prev_t" variable. This is needed to not loose the tasks in // [head;cur-1] when picking one for execution. 
struct Dav1dTask *task_cur_prev; + struct { // async task insertion + atomic_int merge; + pthread_mutex_t lock; + Dav1dTask *head, *tail; + } pending_tasks; } task_thread; // threading (refer to tc[] for per-thread things) diff --git a/chromium/third_party/dav1d/libdav1d/src/ipred.h b/chromium/third_party/dav1d/libdav1d/src/ipred.h index 8664f3f993c..739ef1a266f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/ipred.h +++ b/chromium/third_party/dav1d/libdav1d/src/ipred.h @@ -90,7 +90,5 @@ typedef struct Dav1dIntraPredDSPContext { } Dav1dIntraPredDSPContext; bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c); -bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c); -bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c); #endif /* DAV1D_SRC_IPRED_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c index 50c7a3c7bee..151d4842a04 100644 --- a/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c @@ -726,6 +726,14 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride, } } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/ipred.h" +#elif ARCH_X86 +#include "src/x86/ipred.h" +#endif +#endif + COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { c->intra_pred[DC_PRED ] = ipred_dc_c; c->intra_pred[DC_128_PRED ] = ipred_dc_128_c; @@ -755,9 +763,9 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_intra_pred_dsp_init_arm)(c); + intra_pred_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_intra_pred_dsp_init_x86)(c); + intra_pred_dsp_init_x86(c); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/itx.h b/chromium/third_party/dav1d/libdav1d/src/itx.h index 08f5e212853..d522079907e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/itx.h +++ b/chromium/third_party/dav1d/libdav1d/src/itx.h @@ -44,7 +44,5 @@ typedef struct Dav1dInvTxfmDSPContext { } Dav1dInvTxfmDSPContext; bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); -bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc); -bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c, int bpc); #endif /* DAV1D_SRC_ITX_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c index 2f97a9cd798..d3859892d8b 100644 --- a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c @@ -180,6 +180,14 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, dst[x] = iclip_pixel(dst[x] + *c++); } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/itx.h" +#elif ARCH_X86 +#include "src/x86/itx.h" +#endif +#endif + COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { #define assign_itx_all_fn64(w, h, pfx) \ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \ @@ -247,10 +255,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_itx_dsp_init_arm)(c, bpc); + itx_dsp_init_arm(c, bpc); #endif #if ARCH_X86 - bitfn(dav1d_itx_dsp_init_x86)(c, bpc); + itx_dsp_init_x86(c, bpc); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/lf_mask.c b/chromium/third_party/dav1d/libdav1d/src/lf_mask.c index 
411c88400e7..91fe4a02c8d 100644 --- a/chromium/third_party/dav1d/libdav1d/src/lf_mask.c +++ b/chromium/third_party/dav1d/libdav1d/src/lf_mask.c @@ -212,13 +212,13 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2], #undef set_ctx } -static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2], - const int cby4, const int cbx4, - const int cw4, const int ch4, - const int skip_inter, - const enum RectTxfmSize tx, - uint8_t *const a, uint8_t *const l, - const int ss_hor, const int ss_ver) +static void mask_edges_chroma(uint16_t (*const masks)[32][2][2], + const int cby4, const int cbx4, + const int cw4, const int ch4, + const int skip_inter, + const enum RectTxfmSize tx, + uint8_t *const a, uint8_t *const l, + const int ss_hor, const int ss_ver) { const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx]; const int twl4 = t_dim->lw, thl4 = t_dim->lh; @@ -424,16 +424,14 @@ void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) { lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff; } -static inline void calc_lf_value(uint8_t (*const lflvl_values)[2], - const int is_chroma, const int base_lvl, - const int lf_delta, const int seg_delta, - const Dav1dLoopfilterModeRefDeltas *const mr_delta) +static void calc_lf_value(uint8_t (*const lflvl_values)[2], + const int base_lvl, const int lf_delta, + const int seg_delta, + const Dav1dLoopfilterModeRefDeltas *const mr_delta) { const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63); - if (!base_lvl && is_chroma) { - memset(lflvl_values, 0, 8 * 2); - } else if (!mr_delta) { + if (!mr_delta) { memset(lflvl_values, base, 8 * 2); } else { const int sh = base >= 32; @@ -449,6 +447,17 @@ static inline void calc_lf_value(uint8_t (*const lflvl_values)[2], } } +static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2], + const int base_lvl, const int lf_delta, + const int seg_delta, + const Dav1dLoopfilterModeRefDeltas *const mr_delta) +{ + if (!base_lvl) + memset(lflvl_values, 0, 8 * 2); + else + calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta); +} + void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2], const Dav1dFrameHeader *const hdr, const int8_t lf_delta[4]) @@ -467,16 +476,16 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2], const Dav1dSegmentationData *const segd = hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL; - calc_lf_value(lflvl_values[s][0], 0, hdr->loopfilter.level_y[0], + calc_lf_value(lflvl_values[s][0], hdr->loopfilter.level_y[0], lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas); - calc_lf_value(lflvl_values[s][1], 0, hdr->loopfilter.level_y[1], + calc_lf_value(lflvl_values[s][1], hdr->loopfilter.level_y[1], lf_delta[hdr->delta.lf.multi ? 1 : 0], segd ? segd->delta_lf_y_h : 0, mr_deltas); - calc_lf_value(lflvl_values[s][2], 1, hdr->loopfilter.level_u, - lf_delta[hdr->delta.lf.multi ? 2 : 0], - segd ? segd->delta_lf_u : 0, mr_deltas); - calc_lf_value(lflvl_values[s][3], 1, hdr->loopfilter.level_v, - lf_delta[hdr->delta.lf.multi ? 3 : 0], - segd ? segd->delta_lf_v : 0, mr_deltas); + calc_lf_value_chroma(lflvl_values[s][2], hdr->loopfilter.level_u, + lf_delta[hdr->delta.lf.multi ? 2 : 0], + segd ? segd->delta_lf_u : 0, mr_deltas); + calc_lf_value_chroma(lflvl_values[s][3], hdr->loopfilter.level_v, + lf_delta[hdr->delta.lf.multi ? 3 : 0], + segd ? 
segd->delta_lf_v : 0, mr_deltas); } } diff --git a/chromium/third_party/dav1d/libdav1d/src/lib.c b/chromium/third_party/dav1d/libdav1d/src/lib.c index b21a735964f..396a57c98f4 100644 --- a/chromium/third_party/dav1d/libdav1d/src/lib.c +++ b/chromium/third_party/dav1d/libdav1d/src/lib.c @@ -77,6 +77,7 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) { s->strict_std_compliance = 0; s->output_invisible_frames = 0; s->inloop_filters = DAV1D_INLOOPFILTER_ALL; + s->decode_frame_type = DAV1D_DECODEFRAMETYPE_ALL; } static void close_internal(Dav1dContext **const c_out, int flush); @@ -97,6 +98,37 @@ static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_at return 0; } +static COLD void get_num_threads(Dav1dContext *const c, const Dav1dSettings *const s, + unsigned *n_tc, unsigned *n_fc) +{ + /* ceil(sqrt(n)) */ + static const uint8_t fc_lut[49] = { + 1, /* 1 */ + 2, 2, 2, /* 2- 4 */ + 3, 3, 3, 3, 3, /* 5- 9 */ + 4, 4, 4, 4, 4, 4, 4, /* 10-16 */ + 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 17-25 */ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 26-36 */ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */ + }; + *n_tc = s->n_threads ? s->n_threads : + iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS); + *n_fc = s->max_frame_delay ? umin(s->max_frame_delay, *n_tc) : + *n_tc < 50 ? fc_lut[*n_tc - 1] : 8; // min(8, ceil(sqrt(n))) +} + +COLD int dav1d_get_frame_delay(const Dav1dSettings *const s) { + unsigned n_tc, n_fc; + validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->n_threads >= 0 && + s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->max_frame_delay >= 0 && + s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL)); + + get_num_threads(NULL, s, &n_tc, &n_fc); + return n_fc; +} + COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { static pthread_once_t initted = PTHREAD_ONCE_INIT; pthread_once(&initted, init_internal); @@ -113,6 +145,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { DAV1D_ERR(EINVAL)); validate_input_or_ret(s->operating_point >= 0 && s->operating_point <= 31, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->decode_frame_type >= DAV1D_DECODEFRAMETYPE_ALL && + s->decode_frame_type <= DAV1D_DECODEFRAMETYPE_KEY, DAV1D_ERR(EINVAL)); pthread_attr_t thread_attr; if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM); @@ -133,6 +167,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { c->strict_std_compliance = s->strict_std_compliance; c->output_invisible_frames = s->output_invisible_frames; c->inloop_filters = s->inloop_filters; + c->decode_frame_type = s->decode_frame_type; dav1d_data_props_set_defaults(&c->cached_error_props); @@ -171,20 +206,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { c->flush = &c->flush_mem; atomic_init(c->flush, 0); - c->n_tc = s->n_threads ? s->n_threads : - iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS); - /* ceil(sqrt(n)) */ - static const uint8_t fc_lut[49] = { - 1, /* 1 */ - 2, 2, 2, /* 2- 4 */ - 3, 3, 3, 3, 3, /* 5- 9 */ - 4, 4, 4, 4, 4, 4, 4, /* 10-16 */ - 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 17-25 */ - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 26-36 */ - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */ - }; - c->n_fc = s->max_frame_delay ? umin(s->max_frame_delay, c->n_tc) : - c->n_tc < 50 ? 
fc_lut[c->n_tc - 1] : 8; // min(8, ceil(sqrt(n))) + get_num_threads(c, s, &c->n_tc, &c->n_fc); c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32); if (!c->fc) goto error; @@ -217,8 +239,18 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { } for (unsigned n = 0; n < c->n_fc; n++) { Dav1dFrameContext *const f = &c->fc[n]; - if (c->n_tc > 1) - if (pthread_cond_init(&f->task_thread.cond, NULL)) goto error; + if (c->n_tc > 1) { + if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error; + if (pthread_cond_init(&f->task_thread.cond, NULL)) { + pthread_mutex_destroy(&f->task_thread.lock); + goto error; + } + if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, NULL)) { + pthread_cond_destroy(&f->task_thread.cond); + pthread_mutex_destroy(&f->task_thread.lock); + goto error; + } + } f->c = c; f->task_thread.ttd = &c->task_thread; f->lf.last_sharpness = -1; @@ -317,7 +349,8 @@ static int has_grain(const Dav1dPicture *const pic) { const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data; return fgdata->num_y_points || fgdata->num_uv_points[0] || - fgdata->num_uv_points[1]; + fgdata->num_uv_points[1] || (fgdata->clip_to_restricted_range && + fgdata->chroma_scaling_from_luma); } static int output_image(Dav1dContext *const c, Dav1dPicture *const out) @@ -374,10 +407,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) { Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) { - if (atomic_load(&c->task_thread.first) + 1U < c->n_fc) + unsigned first = atomic_load(&c->task_thread.first); + if (first + 1U < c->n_fc) atomic_fetch_add(&c->task_thread.first, 1U); else atomic_store(&c->task_thread.first, 0); + atomic_compare_exchange_strong(&c->task_thread.reset_task_cur, + &first, UINT_MAX); if (c->task_thread.cur && c->task_thread.cur < c->n_fc) c->task_thread.cur--; } @@ -529,16 +565,16 @@ error: void dav1d_flush(Dav1dContext *const c) { dav1d_data_unref_internal(&c->in); - if (c->out.p.data[0]) + if (c->out.p.frame_hdr) dav1d_thread_picture_unref(&c->out); - if (c->cache.p.data[0]) + if (c->cache.p.frame_hdr) dav1d_thread_picture_unref(&c->cache); c->drain = 0; c->cached_error = 0; for (int i = 0; i < 8; i++) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_ref_dec(&c->refs[i].segmap); dav1d_ref_dec(&c->refs[i].refmvs); @@ -573,6 +609,9 @@ void dav1d_flush(Dav1dContext *const c) { c->fc[i].task_thread.task_head = NULL; c->fc[i].task_thread.task_tail = NULL; c->fc[i].task_thread.task_cur_prev = NULL; + c->fc[i].task_thread.pending_tasks.head = NULL; + c->fc[i].task_thread.pending_tasks.tail = NULL; + atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0); } atomic_init(&c->task_thread.first, 0); c->task_thread.cur = c->n_fc; @@ -590,7 +629,7 @@ void dav1d_flush(Dav1dContext *const c) { f->n_tile_data = 0; f->task_thread.retval = 0; Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next]; - if (out_delayed->p.data[0]) { + if (out_delayed->p.frame_hdr) { dav1d_thread_picture_unref(out_delayed); } } @@ -646,7 +685,9 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { freep(&f->frame_thread.cbi); } if (c->n_tc > 1) { + pthread_mutex_destroy(&f->task_thread.pending_tasks.lock); pthread_cond_destroy(&f->task_thread.cond); + pthread_mutex_destroy(&f->task_thread.lock); } freep(&f->frame_thread.frame_progress); 
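
Note: the get_num_threads() helper factored out in the lib.c hunk above keeps the existing heuristic: fc_lut[] encodes ceil(sqrt(n)) for n = 1..49, so when s->max_frame_delay is left at 0 the frame-thread count defaults to min(8, ceil(sqrt(n_tc))), and the new dav1d_get_frame_delay() entry point simply validates the settings and reports that value without opening a decoder. A minimal stand-alone sketch of the same mapping (frame_threads_for is a hypothetical name used only for illustration, not part of dav1d):

    static unsigned frame_threads_for(const unsigned n_tc) {
        unsigned n_fc = 1;
        while (n_fc * n_fc < n_tc) /* smallest n_fc with n_fc * n_fc >= n_tc, i.e. ceil(sqrt(n_tc)) */
            n_fc++;
        return n_fc < 8 ? n_fc : 8; /* cap at 8: min(8, ceil(sqrt(n_tc))) */
    }

For example, 4 worker threads map to 2 frame threads, 9 to 3, and anything from 50 threads up is capped at 8, matching the lookup table in the hunk above.
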
freep(&f->task_thread.tasks); @@ -667,7 +708,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { dav1d_free_aligned(c->fc); if (c->n_fc > 1 && c->frame_thread.out_delayed) { for (unsigned n = 0; n < c->n_fc; n++) - if (c->frame_thread.out_delayed[n].p.data[0]) + if (c->frame_thread.out_delayed[n].p.frame_hdr) dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]); free(c->frame_thread.out_delayed); } @@ -676,7 +717,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { free(c->tile); for (int n = 0; n < 8; n++) { dav1d_cdf_thread_unref(&c->cdf[n]); - if (c->refs[n].p.p.data[0]) + if (c->refs[n].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[n].p); dav1d_ref_dec(&c->refs[n].refmvs); dav1d_ref_dec(&c->refs[n].segmap); diff --git a/chromium/third_party/dav1d/libdav1d/src/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/loopfilter.h index c159050b26a..a0f78c96574 100644 --- a/chromium/third_party/dav1d/libdav1d/src/loopfilter.h +++ b/chromium/third_party/dav1d/libdav1d/src/loopfilter.h @@ -53,7 +53,5 @@ typedef struct Dav1dLoopFilterDSPContext { } Dav1dLoopFilterDSPContext; bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c); -bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c); -bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c); #endif /* DAV1D_SRC_LOOPFILTER_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c index 6ea744f37bc..cacf2587564 100644 --- a/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c @@ -244,6 +244,14 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride, } } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/loopfilter.h" +#elif ARCH_X86 +#include "src/x86/loopfilter.h" +#endif +#endif + COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) { c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c; c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c; @@ -252,9 +260,9 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_loop_filter_dsp_init_arm)(c); + loop_filter_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_loop_filter_dsp_init_x86)(c); + loop_filter_dsp_init_x86(c); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/looprestoration.h index d0ab8110eb8..f55dd319471 100644 --- a/chromium/third_party/dav1d/libdav1d/src/looprestoration.h +++ b/chromium/third_party/dav1d/libdav1d/src/looprestoration.h @@ -75,8 +75,5 @@ typedef struct Dav1dLoopRestorationDSPContext { } Dav1dLoopRestorationDSPContext; bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc); -bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc); -bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c, int bpc); -bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c, int bpc); #endif /* DAV1D_SRC_LOOPRESTORATION_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c index 254c25d036f..d4d7867dba5 100644 --- a/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c +++ 
b/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c @@ -524,6 +524,16 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride, } } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/looprestoration.h" +#elif ARCH_PPC64LE +#include "src/ppc/looprestoration.h" +#elif ARCH_X86 +#include "src/x86/looprestoration.h" +#endif +#endif + COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, const int bpc) { @@ -534,11 +544,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc); + loop_restoration_dsp_init_arm(c, bpc); #elif ARCH_PPC64LE - bitfn(dav1d_loop_restoration_dsp_init_ppc)(c, bpc); + loop_restoration_dsp_init_ppc(c, bpc); #elif ARCH_X86 - bitfn(dav1d_loop_restoration_dsp_init_x86)(c, bpc); + loop_restoration_dsp_init_x86(c, bpc); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/mc.h b/chromium/third_party/dav1d/libdav1d/src/mc.h index 784b58d2218..59ba2d9a5a0 100644 --- a/chromium/third_party/dav1d/libdav1d/src/mc.h +++ b/chromium/third_party/dav1d/libdav1d/src/mc.h @@ -132,7 +132,5 @@ typedef struct Dav1dMCDSPContext { } Dav1dMCDSPContext; bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c); -bitfn_decls(void dav1d_mc_dsp_init_arm, Dav1dMCDSPContext *c); -bitfn_decls(void dav1d_mc_dsp_init_x86, Dav1dMCDSPContext *c); #endif /* DAV1D_SRC_MC_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c index f8d3e3bda83..20226d8a398 100644 --- a/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c @@ -902,6 +902,14 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride, } while (--h); } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/mc.h" +#elif ARCH_X86 +#include "src/x86/mc.h" +#endif +#endif + COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { #define init_mc_fns(type, name) do { \ c->mc [type] = put_##name##_c; \ @@ -937,9 +945,9 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_mc_dsp_init_arm)(c); + mc_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_mc_dsp_init_x86)(c); + mc_dsp_init_x86(c); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/meson.build b/chromium/third_party/dav1d/libdav1d/src/meson.build index a9ce1594dc5..719015496ee 100644 --- a/chromium/third_party/dav1d/libdav1d/src/meson.build +++ b/chromium/third_party/dav1d/libdav1d/src/meson.build @@ -92,16 +92,6 @@ if is_asm_enabled libdav1d_sources += files( 'arm/cpu.c', - 'arm/refmvs_init.c', - ) - libdav1d_tmpl_sources += files( - 'arm/cdef_init_tmpl.c', - 'arm/filmgrain_init_tmpl.c', - 'arm/ipred_init_tmpl.c', - 'arm/itx_init_tmpl.c', - 'arm/loopfilter_init_tmpl.c', - 'arm/looprestoration_init_tmpl.c', - 'arm/mc_init_tmpl.c', ) if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64') @@ -177,18 +167,6 @@ if is_asm_enabled libdav1d_sources += files( 'x86/cpu.c', - 'x86/msac_init.c', - 'x86/refmvs_init.c', - ) - - libdav1d_tmpl_sources += files( - 'x86/cdef_init_tmpl.c', - 'x86/filmgrain_init_tmpl.c', - 'x86/ipred_init_tmpl.c', - 'x86/itx_init_tmpl.c', - 'x86/loopfilter_init_tmpl.c', - 'x86/looprestoration_init_tmpl.c', - 'x86/mc_init_tmpl.c', ) # NASM source files @@ -196,6 +174,7 @@ if is_asm_enabled 'x86/cpuid.asm', 'x86/msac.asm', 'x86/refmvs.asm', + 
'x86/itx_avx512.asm', 'x86/cdef_avx2.asm', 'x86/itx_avx2.asm', 'x86/looprestoration_avx2.asm', @@ -208,7 +187,6 @@ if is_asm_enabled 'x86/cdef_avx512.asm', 'x86/filmgrain_avx512.asm', 'x86/ipred_avx512.asm', - 'x86/itx_avx512.asm', 'x86/loopfilter_avx512.asm', 'x86/looprestoration_avx512.asm', 'x86/mc_avx512.asm', @@ -226,8 +204,11 @@ if is_asm_enabled if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( + 'x86/cdef16_avx512.asm', 'x86/filmgrain16_avx512.asm', 'x86/ipred16_avx512.asm', + 'x86/itx16_avx512.asm', + 'x86/loopfilter16_avx512.asm', 'x86/looprestoration16_avx512.asm', 'x86/mc16_avx512.asm', 'x86/cdef16_avx2.asm', @@ -255,8 +236,8 @@ if is_asm_enabled 'ppc/cpu.c', ) libdav1d_arch_tmpl_sources += files( - 'ppc/cdef_init_tmpl.c', - 'ppc/looprestoration_init_tmpl.c', + 'ppc/cdef_tmpl.c', + 'ppc/looprestoration_tmpl.c', ) endif endif diff --git a/chromium/third_party/dav1d/libdav1d/src/msac.c b/chromium/third_party/dav1d/libdav1d/src/msac.c index d5f3207bb0d..43d8ae5d07c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/msac.c +++ b/chromium/third_party/dav1d/libdav1d/src/msac.c @@ -203,6 +203,6 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data, #if ARCH_X86_64 && HAVE_ASM s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; - dav1d_msac_init_x86(s); + msac_init_x86(s); #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/obu.c b/chromium/third_party/dav1d/libdav1d/src/obu.c index 7df6850a8c3..b6c2b6990bc 100644 --- a/chromium/third_party/dav1d/libdav1d/src/obu.c +++ b/chromium/third_party/dav1d/libdav1d/src/obu.c @@ -53,6 +53,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, const unsigned init_bit_pos = dav1d_get_bits_pos(gb); #endif + memset(hdr, 0, sizeof(*hdr)); hdr->profile = dav1d_get_bits(gb, 3); if (hdr->profile > 2) goto error; #if DEBUG_SEQ_HDR @@ -60,8 +61,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->still_picture = dav1d_get_bits(gb, 1); - hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1); + hdr->still_picture = dav1d_get_bit(gb); + hdr->reduced_still_picture_header = dav1d_get_bit(gb); if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error; #if DEBUG_SEQ_HDR printf("SEQHDR: post-stillpicture_flags: off=%u\n", @@ -69,22 +70,16 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, #endif if (hdr->reduced_still_picture_header) { - hdr->timing_info_present = 0; - hdr->decoder_model_info_present = 0; - hdr->display_model_info_present = 0; hdr->num_operating_points = 1; - hdr->operating_points[0].idc = 0; hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3); hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2); - hdr->operating_points[0].tier = 0; - hdr->operating_points[0].decoder_model_param_present = 0; - hdr->operating_points[0].display_model_param_present = 0; + hdr->operating_points[0].initial_display_delay = 10; } else { - hdr->timing_info_present = dav1d_get_bits(gb, 1); + hdr->timing_info_present = dav1d_get_bit(gb); if (hdr->timing_info_present) { hdr->num_units_in_tick = dav1d_get_bits(gb, 32); hdr->time_scale = dav1d_get_bits(gb, 32); - hdr->equal_picture_interval = dav1d_get_bits(gb, 1); + hdr->equal_picture_interval = dav1d_get_bit(gb); if (hdr->equal_picture_interval) { const unsigned num_ticks_per_picture = dav1d_get_vlc(gb); if (num_ticks_per_picture == 0xFFFFFFFFU) @@ -92,22 +87,20 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, 
hdr->num_ticks_per_picture = num_ticks_per_picture + 1; } - hdr->decoder_model_info_present = dav1d_get_bits(gb, 1); + hdr->decoder_model_info_present = dav1d_get_bit(gb); if (hdr->decoder_model_info_present) { hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1; hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32); hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1; hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1; } - } else { - hdr->decoder_model_info_present = 0; } #if DEBUG_SEQ_HDR printf("SEQHDR: post-timinginfo: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->display_model_info_present = dav1d_get_bits(gb, 1); + hdr->display_model_info_present = dav1d_get_bit(gb); hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1; for (int i = 0; i < hdr->num_operating_points; i++) { struct Dav1dSequenceHeaderOperatingPoint *const op = @@ -117,23 +110,24 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, goto error; op->major_level = 2 + dav1d_get_bits(gb, 3); op->minor_level = dav1d_get_bits(gb, 2); - op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0; - op->decoder_model_param_present = - hdr->decoder_model_info_present && dav1d_get_bits(gb, 1); - if (op->decoder_model_param_present) { - struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = - &hdr->operating_parameter_info[i]; - opi->decoder_buffer_delay = - dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); - opi->encoder_buffer_delay = - dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); - opi->low_delay_mode = dav1d_get_bits(gb, 1); - } - op->display_model_param_present = - hdr->display_model_info_present && dav1d_get_bits(gb, 1); - if (op->display_model_param_present) { - op->initial_display_delay = dav1d_get_bits(gb, 4) + 1; + if (op->major_level > 3) + op->tier = dav1d_get_bit(gb); + if (hdr->decoder_model_info_present) { + op->decoder_model_param_present = dav1d_get_bit(gb); + if (op->decoder_model_param_present) { + struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = + &hdr->operating_parameter_info[i]; + opi->decoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); + opi->encoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); + opi->low_delay_mode = dav1d_get_bit(gb); + } } + if (hdr->display_model_info_present) + op->display_model_param_present = dav1d_get_bit(gb); + op->initial_display_delay = + op->display_model_param_present ? dav1d_get_bits(gb, 4) + 1 : 10; } #if DEBUG_SEQ_HDR printf("SEQHDR: post-operating-points: off=%u\n", @@ -155,67 +149,58 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, printf("SEQHDR: post-size: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->frame_id_numbers_present = - hdr->reduced_still_picture_header ? 
0 : dav1d_get_bits(gb, 1); - if (hdr->frame_id_numbers_present) { - hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2; - hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1; + if (!hdr->reduced_still_picture_header) { + hdr->frame_id_numbers_present = dav1d_get_bit(gb); + if (hdr->frame_id_numbers_present) { + hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2; + hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1; + } } #if DEBUG_SEQ_HDR printf("SEQHDR: post-frame-id-numbers-present: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->sb128 = dav1d_get_bits(gb, 1); - hdr->filter_intra = dav1d_get_bits(gb, 1); - hdr->intra_edge_filter = dav1d_get_bits(gb, 1); + hdr->sb128 = dav1d_get_bit(gb); + hdr->filter_intra = dav1d_get_bit(gb); + hdr->intra_edge_filter = dav1d_get_bit(gb); if (hdr->reduced_still_picture_header) { - hdr->inter_intra = 0; - hdr->masked_compound = 0; - hdr->warped_motion = 0; - hdr->dual_filter = 0; - hdr->order_hint = 0; - hdr->jnt_comp = 0; - hdr->ref_frame_mvs = 0; - hdr->order_hint_n_bits = 0; hdr->screen_content_tools = DAV1D_ADAPTIVE; hdr->force_integer_mv = DAV1D_ADAPTIVE; } else { - hdr->inter_intra = dav1d_get_bits(gb, 1); - hdr->masked_compound = dav1d_get_bits(gb, 1); - hdr->warped_motion = dav1d_get_bits(gb, 1); - hdr->dual_filter = dav1d_get_bits(gb, 1); - hdr->order_hint = dav1d_get_bits(gb, 1); + hdr->inter_intra = dav1d_get_bit(gb); + hdr->masked_compound = dav1d_get_bit(gb); + hdr->warped_motion = dav1d_get_bit(gb); + hdr->dual_filter = dav1d_get_bit(gb); + hdr->order_hint = dav1d_get_bit(gb); if (hdr->order_hint) { - hdr->jnt_comp = dav1d_get_bits(gb, 1); - hdr->ref_frame_mvs = dav1d_get_bits(gb, 1); - } else { - hdr->jnt_comp = 0; - hdr->ref_frame_mvs = 0; - hdr->order_hint_n_bits = 0; + hdr->jnt_comp = dav1d_get_bit(gb); + hdr->ref_frame_mvs = dav1d_get_bit(gb); } - hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1); + hdr->screen_content_tools = dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-screentools: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif hdr->force_integer_mv = hdr->screen_content_tools ? - dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1) : 2; + dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb) : 2; if (hdr->order_hint) hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1; } - hdr->super_res = dav1d_get_bits(gb, 1); - hdr->cdef = dav1d_get_bits(gb, 1); - hdr->restoration = dav1d_get_bits(gb, 1); + hdr->super_res = dav1d_get_bit(gb); + hdr->cdef = dav1d_get_bit(gb); + hdr->restoration = dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-featurebits: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->hbd = dav1d_get_bits(gb, 1); - if (hdr->profile == 2 && hdr->hbd) hdr->hbd += dav1d_get_bits(gb, 1); - hdr->monochrome = hdr->profile != 1 ? 
dav1d_get_bits(gb, 1) : 0; - hdr->color_description_present = dav1d_get_bits(gb, 1); + hdr->hbd = dav1d_get_bit(gb); + if (hdr->profile == 2 && hdr->hbd) + hdr->hbd += dav1d_get_bit(gb); + if (hdr->profile != 1) + hdr->monochrome = dav1d_get_bit(gb); + hdr->color_description_present = dav1d_get_bit(gb); if (hdr->color_description_present) { hdr->pri = dav1d_get_bits(gb, 8); hdr->trc = dav1d_get_bits(gb, 8); @@ -226,44 +211,40 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, hdr->mtrx = DAV1D_MC_UNKNOWN; } if (hdr->monochrome) { - hdr->color_range = dav1d_get_bits(gb, 1); + hdr->color_range = dav1d_get_bit(gb); hdr->layout = DAV1D_PIXEL_LAYOUT_I400; hdr->ss_hor = hdr->ss_ver = 1; hdr->chr = DAV1D_CHR_UNKNOWN; - hdr->separate_uv_delta_q = 0; } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 && hdr->trc == DAV1D_TRC_SRGB && hdr->mtrx == DAV1D_MC_IDENTITY) { hdr->layout = DAV1D_PIXEL_LAYOUT_I444; - hdr->ss_hor = hdr->ss_ver = 0; hdr->color_range = 1; if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2)) goto error; } else { - hdr->color_range = dav1d_get_bits(gb, 1); + hdr->color_range = dav1d_get_bit(gb); switch (hdr->profile) { case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420; hdr->ss_hor = hdr->ss_ver = 1; break; case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444; - hdr->ss_hor = hdr->ss_ver = 0; break; case 2: if (hdr->hbd == 2) { - hdr->ss_hor = dav1d_get_bits(gb, 1); - hdr->ss_ver = hdr->ss_hor && dav1d_get_bits(gb, 1); - } else { + hdr->ss_hor = dav1d_get_bit(gb); + if (hdr->ss_hor) + hdr->ss_ver = dav1d_get_bit(gb); + } else hdr->ss_hor = 1; - hdr->ss_ver = 0; - } hdr->layout = hdr->ss_hor ? hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 : DAV1D_PIXEL_LAYOUT_I422 : DAV1D_PIXEL_LAYOUT_I444; break; } - hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ? + hdr->chr = (hdr->ss_hor & hdr->ss_ver) ? 
dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN; } if (c->strict_std_compliance && @@ -271,19 +252,20 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, { goto error; } - hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1); + if (!hdr->monochrome) + hdr->separate_uv_delta_q = dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-colorinfo: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->film_grain_present = dav1d_get_bits(gb, 1); + hdr->film_grain_present = dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-filmgrain: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - dav1d_get_bits(gb, 1); // dummy bit + dav1d_get_bit(gb); // dummy bit // We needn't bother flushing the OBU here: we'll check we didn't // overrun in the caller and will then discard gb, so there's no @@ -304,15 +286,15 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, if (use_ref) { for (int i = 0; i < 7; i++) { - if (dav1d_get_bits(gb, 1)) { + if (dav1d_get_bit(gb)) { const Dav1dThreadPicture *const ref = &c->refs[c->frame_hdr->refidx[i]].p; - if (!ref->p.data[0]) return -1; - hdr->width[1] = ref->p.p.w; - hdr->height = ref->p.p.h; + if (!ref->p.frame_hdr) return -1; + hdr->width[1] = ref->p.frame_hdr->width[1]; + hdr->height = ref->p.frame_hdr->height; hdr->render_width = ref->p.frame_hdr->render_width; hdr->render_height = ref->p.frame_hdr->render_height; - hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1); + hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb); if (hdr->super_res.enabled) { const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3); @@ -334,7 +316,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, hdr->width[1] = seqhdr->max_width; hdr->height = seqhdr->max_height; } - hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1); + hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb); if (hdr->super_res.enabled) { const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3); hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1])); @@ -342,7 +324,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, hdr->super_res.width_scale_denominator = 8; hdr->width[0] = hdr->width[1]; } - hdr->have_render_size = dav1d_get_bits(gb, 1); + hdr->have_render_size = dav1d_get_bit(gb); if (hdr->have_render_size) { hdr->render_width = dav1d_get_bits(gb, 16) + 1; hdr->render_height = dav1d_get_bits(gb, 16) + 1; @@ -374,7 +356,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { Dav1dFrameHeader *const hdr = c->frame_hdr; hdr->show_existing_frame = - !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1); + !seqhdr->reduced_still_picture_header && dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-show_existing_frame: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -392,26 +374,27 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } hdr->frame_type = seqhdr->reduced_still_picture_header ? 
DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2); - hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1); + hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bit(gb); if (hdr->show_frame) { if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval) hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length); + hdr->showable_frame = hdr->frame_type != DAV1D_FRAME_TYPE_KEY; } else - hdr->showable_frame = dav1d_get_bits(gb, 1); + hdr->showable_frame = dav1d_get_bit(gb); hdr->error_resilient_mode = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) || hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH || - seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1); + seqhdr->reduced_still_picture_header || dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-frametype_bits: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->disable_cdf_update = dav1d_get_bits(gb, 1); + hdr->disable_cdf_update = dav1d_get_bit(gb); hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ? - dav1d_get_bits(gb, 1) : seqhdr->screen_content_tools; + dav1d_get_bit(gb) : seqhdr->screen_content_tools; if (hdr->allow_screen_content_tools) hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ? - dav1d_get_bits(gb, 1) : seqhdr->force_integer_mv; + dav1d_get_bit(gb) : seqhdr->force_integer_mv; else hdr->force_integer_mv = 0; @@ -422,7 +405,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits); hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 : - hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1); + hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 
1 : dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-frame_size_override_flag: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -433,7 +416,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE; if (seqhdr->decoder_model_info_present) { - hdr->buffer_removal_time_present = dav1d_get_bits(gb, 1); + hdr->buffer_removal_time_present = dav1d_get_bit(gb); if (hdr->buffer_removal_time_present) { for (int i = 0; i < c->seq_hdr->num_operating_points; i++) { const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i]; @@ -454,9 +437,14 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint) for (int i = 0; i < 8; i++) dav1d_get_bits(gb, seqhdr->order_hint_n_bits); + if (c->strict_std_compliance && + hdr->frame_type == DAV1D_FRAME_TYPE_INTRA && hdr->refresh_frame_flags == 0xff) + { + goto error; + } if (read_frame_size(c, gb, 0) < 0) goto error; hdr->allow_intrabc = hdr->allow_screen_content_tools && - !hdr->super_res.enabled && dav1d_get_bits(gb, 1); + !hdr->super_res.enabled && dav1d_get_bit(gb); hdr->use_ref_frame_mvs = 0; } else { hdr->allow_intrabc = 0; @@ -466,7 +454,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { for (int i = 0; i < 8; i++) dav1d_get_bits(gb, seqhdr->order_hint_n_bits); hdr->frame_ref_short_signaling = - seqhdr->order_hint && dav1d_get_bits(gb, 1); + seqhdr->order_hint && dav1d_get_bit(gb); if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8 hdr->refidx[0] = dav1d_get_bits(gb, 3); hdr->refidx[1] = hdr->refidx[2] = -1; @@ -570,13 +558,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { const int use_ref = !hdr->error_resilient_mode && hdr->frame_size_override; if (read_frame_size(c, gb, use_ref) < 0) goto error; - hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1); - hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? DAV1D_FILTER_SWITCHABLE : + hdr->hp = !hdr->force_integer_mv && dav1d_get_bit(gb); + hdr->subpel_filter_mode = dav1d_get_bit(gb) ? 
DAV1D_FILTER_SWITCHABLE : dav1d_get_bits(gb, 2); - hdr->switchable_motion_mode = dav1d_get_bits(gb, 1); + hdr->switchable_motion_mode = dav1d_get_bit(gb); hdr->use_ref_frame_mvs = !hdr->error_resilient_mode && seqhdr->ref_frame_mvs && seqhdr->order_hint && - IS_INTER_OR_SWITCH(hdr) && dav1d_get_bits(gb, 1); + IS_INTER_OR_SWITCH(hdr) && dav1d_get_bit(gb); } #if DEBUG_FRAME_HDR printf("HDR: post-frametype-specific-bits: off=%td\n", @@ -584,14 +572,14 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif hdr->refresh_context = !seqhdr->reduced_still_picture_header && - !hdr->disable_cdf_update && !dav1d_get_bits(gb, 1); + !hdr->disable_cdf_update && !dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-refresh_context: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif // tile data - hdr->tiling.uniform = dav1d_get_bits(gb, 1); + hdr->tiling.uniform = dav1d_get_bit(gb); const int sbsz_min1 = (64 << seqhdr->sb128) - 1; const int sbsz_log2 = 6 + seqhdr->sb128; const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2; @@ -605,7 +593,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->tiling.min_log2_cols); if (hdr->tiling.uniform) { for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols; - hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bits(gb, 1); + hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bit(gb); hdr->tiling.log2_cols++) ; const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols); hdr->tiling.cols = 0; @@ -615,7 +603,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { imax(min_log2_tiles - hdr->tiling.log2_cols, 0); for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows; - hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bits(gb, 1); + hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bit(gb); hdr->tiling.log2_rows++) ; const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows); hdr->tiling.rows = 0; @@ -666,17 +654,17 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { // quant data hdr->quant.yac = dav1d_get_bits(gb, 8); - hdr->quant.ydc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + hdr->quant.ydc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; if (!seqhdr->monochrome) { // If the sequence header says that delta_q might be different // for U, V, we must check whether it actually is for this // frame. - const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0; - hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; - hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bit(gb) : 0; + hdr->quant.udc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; + hdr->quant.uac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; if (diff_uv_delta) { - hdr->quant.vdc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; - hdr->quant.vac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + hdr->quant.vdc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; + hdr->quant.vac_delta = dav1d_get_bit(gb) ? 
dav1d_get_sbits(gb, 7) : 0; } else { hdr->quant.vdc_delta = hdr->quant.udc_delta; hdr->quant.vac_delta = hdr->quant.uac_delta; @@ -686,7 +674,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { printf("HDR: post-quant: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->quant.qm = dav1d_get_bits(gb, 1); + hdr->quant.qm = dav1d_get_bit(gb); if (hdr->quant.qm) { hdr->quant.qm_y = dav1d_get_bits(gb, 4); hdr->quant.qm_u = dav1d_get_bits(gb, 4); @@ -700,17 +688,17 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif // segmentation data - hdr->segmentation.enabled = dav1d_get_bits(gb, 1); + hdr->segmentation.enabled = dav1d_get_bit(gb); if (hdr->segmentation.enabled) { if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) { hdr->segmentation.update_map = 1; hdr->segmentation.temporal = 0; hdr->segmentation.update_data = 1; } else { - hdr->segmentation.update_map = dav1d_get_bits(gb, 1); + hdr->segmentation.update_map = dav1d_get_bit(gb); hdr->segmentation.temporal = - hdr->segmentation.update_map ? dav1d_get_bits(gb, 1) : 0; - hdr->segmentation.update_data = dav1d_get_bits(gb, 1); + hdr->segmentation.update_map ? dav1d_get_bit(gb) : 0; + hdr->segmentation.update_data = dav1d_get_bit(gb); } if (hdr->segmentation.update_data) { @@ -719,48 +707,48 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) { Dav1dSegmentationData *const seg = &hdr->segmentation.seg_data.d[i]; - if (dav1d_get_bits(gb, 1)) { - seg->delta_q = dav1d_get_sbits(gb, 8); + if (dav1d_get_bit(gb)) { + seg->delta_q = dav1d_get_sbits(gb, 9); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_q = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_y_v = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_y_v = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_y_v = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_y_h = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_y_h = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_y_h = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_u = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_u = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_u = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_v = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_v = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_v = 0; } - if (dav1d_get_bits(gb, 1)) { + if (dav1d_get_bit(gb)) { seg->ref = dav1d_get_bits(gb, 3); hdr->segmentation.seg_data.last_active_segid = i; hdr->segmentation.seg_data.preskip = 1; } else { seg->ref = -1; } - if ((seg->skip = dav1d_get_bits(gb, 1))) { + if ((seg->skip = dav1d_get_bit(gb))) { hdr->segmentation.seg_data.last_active_segid = i; hdr->segmentation.seg_data.preskip = 1; } - if ((seg->globalmv = dav1d_get_bits(gb, 1))) { + if ((seg->globalmv = dav1d_get_bit(gb))) { hdr->segmentation.seg_data.last_active_segid = i; hdr->segmentation.seg_data.preskip = 1; } @@ -785,12 +773,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif // delta q - hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bits(gb, 1) : 0; + hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bit(gb) : 0; hdr->delta.q.res_log2 = hdr->delta.q.present ? 
dav1d_get_bits(gb, 2) : 0; hdr->delta.lf.present = hdr->delta.q.present && !hdr->allow_intrabc && - dav1d_get_bits(gb, 1); + dav1d_get_bit(gb); hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0; - hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0; + hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bit(gb) : 0; #if DEBUG_FRAME_HDR printf("HDR: post-delta_q_lf_flags: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -836,18 +824,18 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->loopfilter.mode_ref_deltas = c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas; } - hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bits(gb, 1); + hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bit(gb); if (hdr->loopfilter.mode_ref_delta_enabled) { - hdr->loopfilter.mode_ref_delta_update = dav1d_get_bits(gb, 1); + hdr->loopfilter.mode_ref_delta_update = dav1d_get_bit(gb); if (hdr->loopfilter.mode_ref_delta_update) { for (int i = 0; i < 8; i++) - if (dav1d_get_bits(gb, 1)) + if (dav1d_get_bit(gb)) hdr->loopfilter.mode_ref_deltas.ref_delta[i] = - dav1d_get_sbits(gb, 6); + dav1d_get_sbits(gb, 7); for (int i = 0; i < 2; i++) - if (dav1d_get_bits(gb, 1)) + if (dav1d_get_bit(gb)) hdr->loopfilter.mode_ref_deltas.mode_delta[i] = - dav1d_get_sbits(gb, 6); + dav1d_get_sbits(gb, 7); } } } @@ -893,16 +881,16 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { { // Log2 of the restoration unit size. hdr->restoration.unit_size[0] = 6 + seqhdr->sb128; - if (dav1d_get_bits(gb, 1)) { + if (dav1d_get_bit(gb)) { hdr->restoration.unit_size[0]++; if (!seqhdr->sb128) - hdr->restoration.unit_size[0] += dav1d_get_bits(gb, 1); + hdr->restoration.unit_size[0] += dav1d_get_bit(gb); } hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0]; if ((hdr->restoration.type[1] || hdr->restoration.type[2]) && seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1) { - hdr->restoration.unit_size[1] -= dav1d_get_bits(gb, 1); + hdr->restoration.unit_size[1] -= dav1d_get_bit(gb); } } else { hdr->restoration.unit_size[0] = 8; @@ -918,12 +906,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY : - dav1d_get_bits(gb, 1) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST; + dav1d_get_bit(gb) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST; #if DEBUG_FRAME_HDR printf("HDR: post-txfmmode: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 1) : 0; + hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? 
dav1d_get_bit(gb) : 0; #if DEBUG_FRAME_HDR printf("HDR: post-refmode: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -935,7 +923,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { int off_after = -1; int off_before_idx, off_after_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; + if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc); @@ -963,7 +951,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { unsigned off_before2 = 0xFFFFFFFFU; int off_before2_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; + if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; if (get_poc_diff(seqhdr->order_hint_n_bits, refpoc, off_before) < 0) { @@ -984,18 +972,18 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } } } - hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0; + hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bit(gb) : 0; #if DEBUG_FRAME_HDR printf("HDR: post-extskip: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) && - seqhdr->warped_motion && dav1d_get_bits(gb, 1); + seqhdr->warped_motion && dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-warpmotionbit: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->reduced_txtp_set = dav1d_get_bits(gb, 1); + hdr->reduced_txtp_set = dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-reducedtxtpset: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -1006,9 +994,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { if (IS_INTER_OR_SWITCH(hdr)) { for (int i = 0; i < 7; i++) { - hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY : - dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM : - dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_TRANSLATION : + hdr->gmv[i].type = !dav1d_get_bit(gb) ? DAV1D_WM_TYPE_IDENTITY : + dav1d_get_bit(gb) ? DAV1D_WM_TYPE_ROT_ZOOM : + dav1d_get_bit(gb) ? 
DAV1D_WM_TYPE_TRANSLATION : DAV1D_WM_TYPE_AFFINE; if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue; @@ -1057,10 +1045,10 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->film_grain.present = seqhdr->film_grain_present && (hdr->show_frame || hdr->showable_frame) && - dav1d_get_bits(gb, 1); + dav1d_get_bit(gb); if (hdr->film_grain.present) { const unsigned seed = dav1d_get_bits(gb, 16); - hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bits(gb, 1); + hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bit(gb); if (!hdr->film_grain.update) { const int refidx = dav1d_get_bits(gb, 3); int i; @@ -1084,7 +1072,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } fgd->chroma_scaling_from_luma = - !seqhdr->monochrome && dav1d_get_bits(gb, 1); + !seqhdr->monochrome && dav1d_get_bit(gb); if (seqhdr->monochrome || fgd->chroma_scaling_from_luma || (seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points)) { @@ -1128,8 +1116,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128; fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256; } - fgd->overlap_flag = dav1d_get_bits(gb, 1); - fgd->clip_to_restricted_range = dav1d_get_bits(gb, 1); + fgd->overlap_flag = dav1d_get_bit(gb); + fgd->clip_to_restricted_range = dav1d_get_bit(gb); } } else { memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data)); @@ -1148,7 +1136,7 @@ error: static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) { const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows; - const int have_tile_pos = n_tiles > 1 ? dav1d_get_bits(gb, 1) : 0; + const int have_tile_pos = n_tiles > 1 ? dav1d_get_bit(gb) : 0; if (have_tile_pos) { const int n_bits = c->frame_hdr->tiling.log2_cols + @@ -1194,11 +1182,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa dav1d_init_get_bits(&gb, in->data, in->sz); // obu header - dav1d_get_bits(&gb, 1); // obu_forbidden_bit + dav1d_get_bit(&gb); // obu_forbidden_bit const enum Dav1dObuType type = dav1d_get_bits(&gb, 4); - const int has_extension = dav1d_get_bits(&gb, 1); - const int has_length_field = dav1d_get_bits(&gb, 1); - dav1d_get_bits(&gb, 1); // reserved + const int has_extension = dav1d_get_bit(&gb); + const int has_length_field = dav1d_get_bit(&gb); + dav1d_get_bit(&gb); // reserved int temporal_id = 0, spatial_id = 0; if (has_extension) { @@ -1245,7 +1233,6 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa sizeof(Dav1dSequenceHeader)); if (!ref) return DAV1D_ERR(ENOMEM); Dav1dSequenceHeader *seq_hdr = ref->data; - memset(seq_hdr, 0, sizeof(*seq_hdr)); if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) { dav1d_ref_dec(&ref); goto error; @@ -1270,7 +1257,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa dav1d_ref_dec(&c->mastering_display_ref); dav1d_ref_dec(&c->content_light_ref); for (int i = 0; i < 8; i++) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_ref_dec(&c->refs[i].segmap); dav1d_ref_dec(&c->refs[i].refmvs); @@ -1319,7 +1306,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa if (type != DAV1D_OBU_FRAME) { // This is actually a frame header OBU so read the // trailing bit and check for overrun. 
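
Note: two bit-reader conventions change throughout the obu.c hunks above: every dav1d_get_bits(gb, 1) becomes a dedicated dav1d_get_bit(gb) call, and every dav1d_get_sbits() width argument grows by one (6 -> 7, 8 -> 9), which suggests the signed reader now takes the total field width including the sign bit rather than the magnitude width. A hedged sketch of the sign-extension step such a convention implies (sign_extend is an illustrative helper, not dav1d's actual GetBits code):

    /* Interpret the low n bits of v as an n-bit two's-complement value,
     * with n counting the sign bit; e.g. n = 7 yields the range [-64, 63]. */
    static inline int sign_extend(const unsigned v, const int n) {
        const int shift = 32 - n;
        return (int)(v << shift) >> shift; /* arithmetic right shift restores the sign */
    }

Under this reading, a call such as dav1d_get_sbits(gb, 7) would behave like sign_extend(dav1d_get_bits(gb, 7), 7).
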
- dav1d_get_bits(&gb, 1); + dav1d_get_bit(&gb); if (check_for_overrun(c, &gb, init_bit_pos, len)) { c->frame_hdr = NULL; goto error; @@ -1419,7 +1406,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa #endif // Skip the trailing bit, align to the next byte boundary and check for overrun. - dav1d_get_bits(&gb, 1); + dav1d_get_bit(&gb); dav1d_bytealign_get_bits(&gb); if (check_for_overrun(c, &gb, init_bit_pos, len)) { dav1d_ref_dec(&ref); @@ -1471,7 +1458,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif // Skip the trailing bit, align to the next byte boundary and check for overrun. - dav1d_get_bits(&gb, 1); + dav1d_get_bit(&gb); dav1d_bytealign_get_bits(&gb); if (check_for_overrun(c, &gb, init_bit_pos, len)) { dav1d_ref_dec(&ref); @@ -1503,7 +1490,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa if (payload_size <= 0) { dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n"); - goto error; + break; } Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t)); @@ -1550,7 +1537,26 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa if (c->seq_hdr && c->frame_hdr) { if (c->frame_hdr->show_existing_frame) { + if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr) goto error; + switch (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type) { + case DAV1D_FRAME_TYPE_INTER: + case DAV1D_FRAME_TYPE_SWITCH: + if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE) + goto skip; + break; + case DAV1D_FRAME_TYPE_INTRA: + if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA) + goto skip; + // fall-through + default: + break; + } if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error; + if (c->strict_std_compliance && + !c->refs[c->frame_hdr->existing_frame_idx].p.showable) + { + goto error; + } if (c->n_fc == 1) { dav1d_thread_picture_ref(&c->out, &c->refs[c->frame_hdr->existing_frame_idx].p); @@ -1570,10 +1576,13 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) { - if (atomic_load(&c->task_thread.first) + 1U < c->n_fc) + unsigned first = atomic_load(&c->task_thread.first); + if (first + 1U < c->n_fc) atomic_fetch_add(&c->task_thread.first, 1U); else atomic_store(&c->task_thread.first, 0); + atomic_compare_exchange_strong(&c->task_thread.reset_task_cur, + &first, UINT_MAX); if (c->task_thread.cur && c->task_thread.cur < c->n_fc) c->task_thread.cur--; } @@ -1602,10 +1611,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa } if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) { const int r = c->frame_hdr->existing_frame_idx; + c->refs[r].p.showable = 0; for (int i = 0; i < 8; i++) { if (i == r) continue; - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p); @@ -1621,6 +1631,23 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa } c->frame_hdr = NULL; } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) { + switch (c->frame_hdr->frame_type) { + case DAV1D_FRAME_TYPE_INTER: + case DAV1D_FRAME_TYPE_SWITCH: + if 
(c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE || + (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE && + !c->frame_hdr->refresh_frame_flags)) + goto skip; + break; + case DAV1D_FRAME_TYPE_INTRA: + if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA || + (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE && + !c->frame_hdr->refresh_frame_flags)) + goto skip; + // fall-through + default: + break; + } if (!c->n_tile_data) goto error; if ((res = dav1d_submit_frame(c)) < 0) @@ -1633,6 +1660,26 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa return len + init_byte_pos; +skip: + // update refs with only the headers in case we skip the frame + for (int i = 0; i < 8; i++) { + if (c->frame_hdr->refresh_frame_flags & (1 << i)) { + dav1d_thread_picture_unref(&c->refs[i].p); + c->refs[i].p.p.frame_hdr = c->frame_hdr; + c->refs[i].p.p.seq_hdr = c->seq_hdr; + c->refs[i].p.p.frame_hdr_ref = c->frame_hdr_ref; + c->refs[i].p.p.seq_hdr_ref = c->seq_hdr_ref; + dav1d_ref_inc(c->frame_hdr_ref); + dav1d_ref_inc(c->seq_hdr_ref); + } + } + + dav1d_ref_dec(&c->frame_hdr_ref); + c->frame_hdr = NULL; + c->n_tiles = 0; + + return len + init_byte_pos; + error: dav1d_data_props_copy(&c->cached_error_props, &in->m); dav1d_log(c, "Error parsing OBU data\n"); diff --git a/chromium/third_party/dav1d/libdav1d/src/picture.c b/chromium/third_party/dav1d/libdav1d/src/picture.c index bebc4dd9c17..58ebd824d68 100644 --- a/chromium/third_party/dav1d/libdav1d/src/picture.c +++ b/chromium/third_party/dav1d/libdav1d/src/picture.c @@ -194,10 +194,15 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f dav1d_ref_dec(&c->itut_t35_ref); c->itut_t35 = NULL; + // Don't clear these flags from c->frame_flags if the frame is not visible. + // This way they will be added to the next visible frame too. + const int flags_mask = (f->frame_hdr->show_frame || c->output_invisible_frames) + ? 
0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO); p->flags = c->frame_flags; - c->frame_flags = 0; + c->frame_flags &= flags_mask; p->visible = f->frame_hdr->show_frame; + p->showable = f->frame_hdr->showable_frame; if (have_frame_mt) { atomic_init(&p->progress[0], 0); atomic_init(&p->progress[1], 0); @@ -228,13 +233,13 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) { if (src->ref) { validate_input(src->data[0] != NULL); dav1d_ref_inc(src->ref); - if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref); - if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref); - if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); - if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref); - if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref); - if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref); } + if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref); + if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref); + if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); + if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref); + if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref); + if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref); *dst = *src; } @@ -255,6 +260,7 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst, { dav1d_picture_ref(&dst->p, &src->p); dst->visible = src->visible; + dst->showable = src->showable; dst->progress = src->progress; dst->flags = src->flags; } @@ -264,6 +270,7 @@ void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst, { dav1d_picture_move_ref(&dst->p, &src->p); dst->visible = src->visible; + dst->showable = src->showable; dst->progress = src->progress; dst->flags = src->flags; memset(src, 0, sizeof(*src)); @@ -275,13 +282,13 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) { if (p->ref) { validate_input(p->data[0] != NULL); dav1d_ref_dec(&p->ref); - dav1d_ref_dec(&p->seq_hdr_ref); - dav1d_ref_dec(&p->frame_hdr_ref); - dav1d_ref_dec(&p->m.user_data.ref); - dav1d_ref_dec(&p->content_light_ref); - dav1d_ref_dec(&p->mastering_display_ref); - dav1d_ref_dec(&p->itut_t35_ref); } + dav1d_ref_dec(&p->seq_hdr_ref); + dav1d_ref_dec(&p->frame_hdr_ref); + dav1d_ref_dec(&p->m.user_data.ref); + dav1d_ref_dec(&p->content_light_ref); + dav1d_ref_dec(&p->mastering_display_ref); + dav1d_ref_dec(&p->itut_t35_ref); memset(p, 0, sizeof(*p)); dav1d_data_props_set_defaults(&p->m); } diff --git a/chromium/third_party/dav1d/libdav1d/src/picture.h b/chromium/third_party/dav1d/libdav1d/src/picture.h index 0e30d48eb86..154c85a0c6a 100644 --- a/chromium/third_party/dav1d/libdav1d/src/picture.h +++ b/chromium/third_party/dav1d/libdav1d/src/picture.h @@ -52,6 +52,10 @@ enum PictureFlags { typedef struct Dav1dThreadPicture { Dav1dPicture p; int visible; + // This can be set for inter frames, non-key intra frames, or for invisible + // keyframes that have not yet been made visible using the show-existing-frame + // mechanism. + int showable; enum PictureFlags flags; // [0] block data (including segmentation map and motion vectors) // [1] pixel data diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/cdef.h b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef.h new file mode 100644 index 00000000000..b794ba53bef --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef.h @@ -0,0 +1,61 @@ +/* + * Copyright © 2019, Luca Barbato + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdlib.h> + +#include "common/bitdepth.h" +#include "common/intops.h" + +#include "src/cdef.h" +#include "src/cpu.h" + +#define cdef_vsx_fn(w, h) \ +void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ + const ptrdiff_t dst_stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ + const int pri_strength, \ + const int sec_strength, \ + const int dir, \ + const int damping, \ + const enum CdefEdgeFlags edges) + +cdef_vsx_fn(4, 4); +cdef_vsx_fn(4, 8); +cdef_vsx_fn(8, 8); + +static ALWAYS_INLINE void cdef_dsp_init_ppc(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; + +#if BITDEPTH == 8 + c->fb[0] = dav1d_cdef_filter_8x8_vsx; + c->fb[1] = dav1d_cdef_filter_4x8_vsx; + c->fb[2] = dav1d_cdef_filter_4x4_vsx; +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/cdef_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef_tmpl.c new file mode 100644 index 00000000000..e2e759810f7 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef_tmpl.c @@ -0,0 +1,487 @@ +/* + * Copyright © 2019, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/ppc/dav1d_types.h" +#include "src/ppc/cdef.h" + +#if BITDEPTH == 8 +static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, + const int damping) +{ + const i16x8 zero = vec_splat_s16(0); + if (!threshold) return zero; + const uint16_t shift = imax(0, damping - ulog2(threshold)); + const i16x8 abs_diff = vec_abs(diff); + const b16x8 mask = vec_cmplt(diff, zero); + const i16x8 thr = vec_splats(threshold); + const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift))); + const i16x8 max = vec_max(zero, sub); + const i16x8 min = vec_min(abs_diff, max); + const i16x8 neg = vec_sub(zero, min); + return vec_sel(min, neg, mask); +} + +static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride, + const uint8_t *src, const ptrdiff_t src_stride, + const uint8_t (*left)[2], const uint8_t *const top, + const uint8_t *const bottom, const int w, const int h, + const enum CdefEdgeFlags edges) +{ + const u16x8 fill = vec_splats((uint16_t)INT16_MAX); + + u16x8 l0; + u16x8 l1; + + int y_start = -2, y_end = h + 2; + + // Copy top and bottom first + if (!(edges & CDEF_HAVE_TOP)) { + l0 = fill; + l1 = fill; + y_start = 0; + } else { + l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2)); + l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2)); + } + + vec_st(l0, 0, tmp - 2 * 8); + vec_st(l1, 0, tmp - 1 * 8); + + if (!(edges & CDEF_HAVE_BOTTOM)) { + l0 = fill; + l1 = fill; + y_end -= 2; + } else { + l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2)); + l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2)); + } + + vec_st(l0, 0, tmp + (h + 0) * 8); + vec_st(l1, 0, tmp + (h + 1) * 8); + + int y_with_left_edge = 0; + if (!(edges & CDEF_HAVE_LEFT)) { + u16x8 l = u8h_to_u16(vec_vsx_ld(0, src)); + vec_vsx_st(l, 0, tmp + 2); + + y_with_left_edge = 1; + } + + for (int y = y_with_left_edge; y < h; y++) { + u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride)); + vec_st(l, 0, tmp + y * 8); + } + + if (!(edges & CDEF_HAVE_LEFT)) { + for (int y = y_start; y < y_end; y++) { + tmp[y * 8] = INT16_MAX; + tmp[1 + y * 8] = INT16_MAX; + } + } else { + for (int y = 0; y < h; y++) { + tmp[y * 8] = left[y][0]; + tmp[1 + y * 8] = left[y][1]; + } + } + if (!(edges & CDEF_HAVE_RIGHT)) { + for (int y = y_start; y < y_end; y++) { + tmp[- 2 + (y + 1) * 8] = INT16_MAX; + tmp[- 1 + (y + 1) * 8] = INT16_MAX; + } + } +} + +static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride, + const uint8_t *src, const ptrdiff_t src_stride, + const uint8_t (*left)[2], const uint8_t *const top, + const uint8_t *const bottom, const int w, const int h, + const enum CdefEdgeFlags edges) +{ + const u16x8 fill = vec_splats((uint16_t)INT16_MAX); + + u16x8 l0h, l0l; + u16x8 l1h, l1l; + + int y_start = -2, y_end = h + 2; + + // Copy top and bottom first + if (!(edges & CDEF_HAVE_TOP)) { + l0h = fill; + l0l = fill; + l1h = fill; + l1l = fill; + y_start = 0; + } else { + u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2); + u8x16 l1 = 
vec_vsx_ld(0, top + 1 * src_stride - 2); + l0h = u8h_to_u16(l0); + l0l = u8l_to_u16(l0); + l1h = u8h_to_u16(l1); + l1l = u8l_to_u16(l1); + } + + vec_st(l0h, 0, tmp - 4 * 8); + vec_st(l0l, 0, tmp - 3 * 8); + vec_st(l1h, 0, tmp - 2 * 8); + vec_st(l1l, 0, tmp - 1 * 8); + + if (!(edges & CDEF_HAVE_BOTTOM)) { + l0h = fill; + l0l = fill; + l1h = fill; + l1l = fill; + y_end -= 2; + } else { + u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2); + u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2); + l0h = u8h_to_u16(l0); + l0l = u8l_to_u16(l0); + l1h = u8h_to_u16(l1); + l1l = u8l_to_u16(l1); + } + + vec_st(l0h, 0, tmp + (h + 0) * 16); + vec_st(l0l, 0, tmp + (h + 0) * 16 + 8); + vec_st(l1h, 0, tmp + (h + 1) * 16); + vec_st(l1l, 0, tmp + (h + 1) * 16 + 8); + + int y_with_left_edge = 0; + if (!(edges & CDEF_HAVE_LEFT)) { + u8x16 l = vec_vsx_ld(0, src); + u16x8 lh = u8h_to_u16(l); + u16x8 ll = u8l_to_u16(l); + vec_vsx_st(lh, 0, tmp + 2); + vec_vsx_st(ll, 0, tmp + 8 + 2); + + y_with_left_edge = 1; + } + + for (int y = y_with_left_edge; y < h; y++) { + u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride); + u16x8 lh = u8h_to_u16(l); + u16x8 ll = u8l_to_u16(l); + vec_st(lh, 0, tmp + y * 16); + vec_st(ll, 0, tmp + 8 + y * 16); + } + + if (!(edges & CDEF_HAVE_LEFT)) { + for (int y = y_start; y < y_end; y++) { + tmp[y * 16] = INT16_MAX; + tmp[1 + y * 16] = INT16_MAX; + } + } else { + for (int y = 0; y < h; y++) { + tmp[y * 16] = left[y][0]; + tmp[1 + y * 16] = left[y][1]; + } + } + if (!(edges & CDEF_HAVE_RIGHT)) { + for (int y = y_start; y < y_end; y++) { + tmp[- 6 + (y + 1) * 16] = INT16_MAX; + tmp[- 5 + (y + 1) * 16] = INT16_MAX; + } + } +} + +static inline i16x8 max_mask(i16x8 a, i16x8 b) { + const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX); + + const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX); + + const i16x8 val = vec_sel(a, b, mask); + + return vec_max(val, b); +} + +#define LOAD_PIX(addr) \ + const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \ + i16x8 max = px; \ + i16x8 min = px; \ + i16x8 sum = vec_splat_s16(0); + +#define LOAD_PIX4(addr) \ + const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \ + const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \ + const i16x8 px = vec_xxpermdi(a, b, 0); \ + i16x8 max = px; \ + i16x8 min = px; \ + i16x8 sum = vec_splat_s16(0); + +#define LOAD_DIR(p, addr, o0, o1) \ + const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \ + const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \ + const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \ + const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1); + +#define LOAD_DIR4(p, addr, o0, o1) \ + LOAD_DIR(p ## a, addr, o0, o1) \ + LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \ + const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \ + const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \ + const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \ + const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0); + +#define CONSTRAIN(p, strength) \ + const i16x8 p ## _d0 = vec_sub(p ## 0, px); \ + const i16x8 p ## _d1 = vec_sub(p ## 1, px); \ + const i16x8 p ## _d2 = vec_sub(p ## 2, px); \ + const i16x8 p ## _d3 = vec_sub(p ## 3, px); \ +\ + i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \ + i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \ + i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \ + i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping); + +#define MIN_MAX(p) \ + max = max_mask(p ## 0, max); \ + min = vec_min(p ## 0, min); \ + max = max_mask(p ## 
1, max); \ + min = vec_min(p ## 1, min); \ + max = max_mask(p ## 2, max); \ + min = vec_min(p ## 2, min); \ + max = max_mask(p ## 3, max); \ + min = vec_min(p ## 3, min); + +#define PRI_0(p) \ + p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \ + p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even))); + +#define PRI_1(p) \ + p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \ + p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even))); + +#define SEC_0(p) \ + p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \ + p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \ + p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \ + p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1)); + +#define UPDATE_SUM(p) \ + const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \ + const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \ + sum = vec_add(sum, p ## sum0); \ + sum = vec_add(sum, p ## sum1); + +static inline void +filter_4xN(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const pixel *const bottom, const int w, const int h, + const int pri_strength, const int sec_strength, const int dir, + const int damping, const enum CdefEdgeFlags edges, + const ptrdiff_t tmp_stride, uint16_t *tmp) +{ + const int8_t cdef_directions[8 /* dir */][2 /* pass */] = { + { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, + { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, + { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } + }; + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1); + const int off1 = cdef_directions[dir][0]; + const int off1_1 = cdef_directions[dir][1]; + + const int off2 = cdef_directions[(dir + 2) & 7][0]; + const int off3 = cdef_directions[(dir + 6) & 7][0]; + + const int off2_1 = cdef_directions[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions[(dir + 6) & 7][1]; + + copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); + + for (int y = 0; y < h / 2; y++) { + LOAD_PIX4(tmp) + + // Primary pass + LOAD_DIR4(p, tmp, off1, off1_1) + + CONSTRAIN(p, pri_strength) + + MIN_MAX(p) + + PRI_0(p) + PRI_1(p) + + UPDATE_SUM(p) + + // Secondary pass 1 + LOAD_DIR4(s, tmp, off2, off3) + + CONSTRAIN(s, sec_strength) + + MIN_MAX(s) + + SEC_0(s) + + UPDATE_SUM(s) + + // Secondary pass 2 + LOAD_DIR4(s2, tmp, off2_1, off3_1) + + CONSTRAIN(s2, sec_strength) + + MIN_MAX(s2) + + UPDATE_SUM(s2) + + // Store + i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); + bias = vec_sub(vec_splat_s16(8), bias); + i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); + i16x8 vdst = vec_max(vec_min(unclamped, max), min); + + dst[0] = vdst[0]; + dst[1] = vdst[1]; + dst[2] = vdst[2]; + dst[3] = vdst[3]; + + tmp += tmp_stride; + dst += PXSTRIDE(dst_stride); + dst[0] = vdst[4]; + dst[1] = vdst[5]; + dst[2] = vdst[6]; + dst[3] = vdst[7]; + + tmp += tmp_stride; + dst += PXSTRIDE(dst_stride); + } +} + +static inline void +filter_8xN(pixel *dst, const ptrdiff_t dst_stride, + const pixel (*left)[2], const pixel *const top, + const pixel *const bottom, const int w, const int h, + const 
int pri_strength, const int sec_strength, const int dir, + const int damping, const enum CdefEdgeFlags edges, + const ptrdiff_t tmp_stride, uint16_t *tmp) +{ + const int8_t cdef_directions[8 /* dir */][2 /* pass */] = { + { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, + { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, + { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, + { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, + { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } + }; + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + + + const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1); + const int off1 = cdef_directions[dir][0]; + const int off1_1 = cdef_directions[dir][1]; + + const int off2 = cdef_directions[(dir + 2) & 7][0]; + const int off3 = cdef_directions[(dir + 6) & 7][0]; + + const int off2_1 = cdef_directions[(dir + 2) & 7][1]; + const int off3_1 = cdef_directions[(dir + 6) & 7][1]; + + copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); + + for (int y = 0; y < h; y++) { + LOAD_PIX(tmp) + + // Primary pass + LOAD_DIR(p, tmp, off1, off1_1) + + CONSTRAIN(p, pri_strength) + + MIN_MAX(p) + + PRI_0(p) + PRI_1(p) + + UPDATE_SUM(p) + + // Secondary pass 1 + LOAD_DIR(s, tmp, off2, off3) + + CONSTRAIN(s, sec_strength) + + MIN_MAX(s) + + SEC_0(s) + + UPDATE_SUM(s) + + // Secondary pass 2 + LOAD_DIR(s2, tmp, off2_1, off3_1) + + CONSTRAIN(s2, sec_strength) + + MIN_MAX(s2) + + UPDATE_SUM(s2) + + // Store + i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); + bias = vec_sub(vec_splat_s16(8), bias); + i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); + i16x8 vdst = vec_max(vec_min(unclamped, max), min); + + dst[0] = vdst[0]; + dst[1] = vdst[1]; + dst[2] = vdst[2]; + dst[3] = vdst[3]; + dst[4] = vdst[4]; + dst[5] = vdst[5]; + dst[6] = vdst[6]; + dst[7] = vdst[7]; + + tmp += tmp_stride; + dst += PXSTRIDE(dst_stride); + } + +} + +#define cdef_fn(w, h, tmp_stride) \ +void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ + const ptrdiff_t dst_stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ + const int pri_strength, \ + const int sec_strength, \ + const int dir, \ + const int damping, \ + const enum CdefEdgeFlags edges) \ +{ \ + ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ + uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ + filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ + sec_strength, dir, damping, edges, tmp_stride, tmp); \ +} + +cdef_fn(4, 4, 8); +cdef_fn(4, 8, 8); +cdef_fn(8, 8, 16); +#endif diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration.h new file mode 100644 index 00000000000..3fe16318bd5 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration.h @@ -0,0 +1,48 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Michail Alvanos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "common/intops.h" + +#include "src/cpu.h" +#include "src/looprestoration.h" + +void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + +static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; + +#if BITDEPTH == 8 + c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_vsx; +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration_tmpl.c new file mode 100644 index 00000000000..c0c64e18002 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration_tmpl.c @@ -0,0 +1,321 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Michail Alvanos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/ppc/dav1d_types.h" +#include "src/ppc/looprestoration.h" + +#if BITDEPTH == 8 + +#define REST_UNIT_STRIDE (400) + +static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) { + v = vec_max(minv, v); + v = vec_min(maxv, v); + return v; +} + +#define APPLY_FILTER_H(v, f, ssum1, ssum2) do { \ + i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \ + i16x8 ktmp_u16_low = (i16x8) u8l_to_u16(v); \ + ssum1 = vec_madd(ktmp_u16_high, f, ssum1); \ + ssum2 = vec_madd(ktmp_u16_low, f, ssum2); \ +} while (0) + +static void wiener_filter_h_vsx(int32_t *hor_ptr, + uint8_t *tmp_ptr, + const int16_t filterh[8], + const int w, const int h) +{ + const i32x4 zerov = vec_splats(0); + const i32x4 seven_vec = vec_splats(7); + const i32x4 bitdepth_added_vec = vec_splats(1 << 14); + const i32x4 round_bits_vec = vec_splats(3); + const i32x4 rounding_off_vec = vec_splats(1<<2); + const i32x4 clip_limit_v = vec_splats((1 << 13) - 1); + + i16x8 filterhvall = vec_vsx_ld(0, filterh); + i16x8 filterhv0 = vec_splat( filterhvall, 0); + i16x8 filterhv1 = vec_splat( filterhvall, 1); + i16x8 filterhv2 = vec_splat( filterhvall, 2); + i16x8 filterhv3 = vec_splat( filterhvall, 3); + i16x8 filterhv4 = vec_splat( filterhvall, 4); + i16x8 filterhv5 = vec_splat( filterhvall, 5); + i16x8 filterhv6 = vec_splat( filterhvall, 6); + + for (int j = 0; j < h + 6; j++) { + for (int i = 0; i < w; i+=16) { + i32x4 sum1 = bitdepth_added_vec; + i32x4 sum2 = bitdepth_added_vec; + i32x4 sum3 = bitdepth_added_vec; + i32x4 sum4 = bitdepth_added_vec; + + u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]); + u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]); + + u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15); + u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14); + u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13); + u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12); + u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11); + u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10); + + u16x8 tmp_u16_high = u8h_to_u16(tmp_v3); + u16x8 tmp_u16_low = u8l_to_u16(tmp_v3); + + i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high); + i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high); + i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low); + i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low); + + i16x8 ssum1 = (i16x8) zerov; + i16x8 ssum2 = (i16x8) zerov; + + APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2); + APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2); + APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2); + APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2); + APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2); + APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2); + APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2); + + sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec); + sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec); + sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec); + sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec); + + sum1 = (sum1 + rounding_off_vec) >> round_bits_vec; + sum2 = (sum2 + rounding_off_vec) >> round_bits_vec; + sum3 = (sum3 + rounding_off_vec) >> round_bits_vec; + sum4 = (sum4 + rounding_off_vec) >> round_bits_vec; + + sum1 = iclip_vec(sum1, zerov, clip_limit_v); + sum2 = iclip_vec(sum2, zerov, clip_limit_v); + sum3 = iclip_vec(sum3, zerov, clip_limit_v); + sum4 = iclip_vec(sum4, zerov, clip_limit_v); + + vec_st(sum1, 0, &hor_ptr[i]); + vec_st(sum2, 16, &hor_ptr[i]); + vec_st(sum3, 32, &hor_ptr[i]); + vec_st(sum4, 48, &hor_ptr[i]); + } + tmp_ptr += REST_UNIT_STRIDE; + hor_ptr += REST_UNIT_STRIDE; + } +} + +static inline i16x8 iclip_u8_vec(i16x8 v) { + const i16x8 
zerov = vec_splats((int16_t)0); + const i16x8 maxv = vec_splats((int16_t)255); + v = vec_max(zerov, v); + v = vec_min(maxv, v); + return v; +} + +#define APPLY_FILTER_V(index, f) do { \ + i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + i32x4 v3 = vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \ + sum1 = sum1 + v1 * f; \ + sum2 = sum2 + v2 * f; \ + sum3 = sum3 + v3 * f; \ + sum4 = sum4 + v4 * f; \ +} while (0) + +#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \ + i32x4 sum1 = round_vec; \ + i32x4 sum2 = round_vec; \ + i32x4 sum3 = round_vec; \ + i32x4 sum4 = round_vec; \ + APPLY_FILTER_V(0, filterv0); \ + APPLY_FILTER_V(1, filterv1); \ + APPLY_FILTER_V(2, filterv2); \ + APPLY_FILTER_V(3, filterv3); \ + APPLY_FILTER_V(4, filterv4); \ + APPLY_FILTER_V(5, filterv5); \ + APPLY_FILTER_V(6, filterv6); \ + sum1 = sum1 >> round_bits_vec; \ + sum2 = sum2 >> round_bits_vec; \ + sum3 = sum3 >> round_bits_vec; \ + sum4 = sum4 >> round_bits_vec; \ + i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \ + i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \ + sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \ + sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \ + sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \ +} while (0) + +static inline void wiener_filter_v_vsx(uint8_t *p, + const ptrdiff_t stride, + const int32_t *hor, + const int16_t filterv[8], + const int w, const int h) +{ + const i32x4 round_bits_vec = vec_splats(11); + const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18)); + + i32x4 filterv0 = vec_splats((int32_t) filterv[0]); + i32x4 filterv1 = vec_splats((int32_t) filterv[1]); + i32x4 filterv2 = vec_splats((int32_t) filterv[2]); + i32x4 filterv3 = vec_splats((int32_t) filterv[3]); + i32x4 filterv4 = vec_splats((int32_t) filterv[4]); + i32x4 filterv5 = vec_splats((int32_t) filterv[5]); + i32x4 filterv6 = vec_splats((int32_t) filterv[6]); + + for (int j = 0; j < h; j++) { + for (int i = 0; i <(w-w%16); i += 16) { + u8x16 sum_pixel; + LOAD_AND_APPLY_FILTER_V(sum_pixel, hor); + vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(stride) + i]); + } + // remaining loop + if (w & 0xf){ + int i=w-w%16; + ALIGN_STK_16(uint8_t, tmp_out, 16,); + u8x16 sum_pixel; + + LOAD_AND_APPLY_FILTER_V(sum_pixel, hor); + vec_vsx_st(sum_pixel, 0, tmp_out); + + for (int k=0; i<w; i++, k++) { + p[j * PXSTRIDE(stride) + i] = tmp_out[k]; + } + } + } +} + +static inline void padding(uint8_t *dst, const uint8_t *p, + const ptrdiff_t stride, const uint8_t (*left)[4], + const uint8_t *lpf, int unit_w, const int stripe_h, + const enum LrEdgeFlags edges) +{ + const int have_left = !!(edges & LR_HAVE_LEFT); + const int have_right = !!(edges & LR_HAVE_RIGHT); + + // Copy more pixels if we don't have to pad them + unit_w += 3 * have_left + 3 * have_right; + uint8_t *dst_l = dst + 3 * !have_left; + p -= 3 * have_left; + lpf -= 3 * have_left; + + if (edges & LR_HAVE_TOP) { + // Copy previous loop filtered rows + const uint8_t *const above_1 = lpf; + const uint8_t *const above_2 = above_1 + PXSTRIDE(stride); + pixel_copy(dst_l, above_1, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w); + } else { + // Pad with first row + pixel_copy(dst_l, p, unit_w); + pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, 
unit_w); + if (have_left) { + pixel_copy(dst_l, &left[0][1], 3); + pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3); + pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3); + } + } + + uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE; + if (edges & LR_HAVE_BOTTOM) { + // Copy next loop filtered rows + const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride); + const uint8_t *const below_2 = below_1 + PXSTRIDE(stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w); + } else { + // Pad with last row + const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride); + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w); + if (have_left) { + pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); + } + } + + // Inner UNIT_WxSTRIPE_H + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left); + dst_tl += REST_UNIT_STRIDE; + p += PXSTRIDE(stride); + } + + if (!have_right) { + uint8_t *pad = dst_l + unit_w; + uint8_t *row_last = &dst_l[unit_w - 1]; + // Pad 3x(STRIPE_H+6) with last column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(pad, *row_last, 3); + pad += REST_UNIT_STRIDE; + row_last += REST_UNIT_STRIDE; + } + } + + if (!have_left) { + // Pad 3x(STRIPE_H+6) with first column + for (int j = 0; j < stripe_h + 6; j++) { + pixel_set(dst, *dst_l, 3); + dst += REST_UNIT_STRIDE; + dst_l += REST_UNIT_STRIDE; + } + } else { + dst += 3 * REST_UNIT_STRIDE; + for (int j = 0; j < stripe_h; j++) { + pixel_copy(dst, &left[j][1], 3); + dst += REST_UNIT_STRIDE; + } + } +} + +// FIXME Could split into luma and chroma specific functions, +// (since first and last tops are always 0 for chroma) +// FIXME Could implement a version that requires less temporary memory +// (should be possible to implement with only 6 rows of temp storage) +void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + const int16_t (*const filter)[8] = params->filter; + + // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels + // of padding above and below + ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); + padding(tmp, p, stride, left, lpf, w, h, edges); + ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,); + + wiener_filter_h_vsx(hor, tmp, filter[0], w, h); + wiener_filter_v_vsx(p, stride, hor, filter[1], w, h); +} +#endif diff --git a/chromium/third_party/dav1d/libdav1d/src/qm.h b/chromium/third_party/dav1d/libdav1d/src/qm.h index 23b2348a70c..8191c8afa77 100644 --- a/chromium/third_party/dav1d/libdav1d/src/qm.h +++ b/chromium/third_party/dav1d/libdav1d/src/qm.h @@ -30,7 +30,7 @@ #include "src/levels.h" -extern const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES]; +EXTERN const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES]; void dav1d_init_qm_tables(void); diff --git 
a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c index 0ed4169aa00..3158ef5b023 100644 --- a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c +++ b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c @@ -591,7 +591,7 @@ static int decode_coefs(Dav1dTaskContext *const t, const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane]; const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL; const int dq_shift = imax(0, t_dim->ctx - 2); - const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc)); + const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc)); unsigned cul_level, dc_sign_level; if (!dc_tok) { @@ -608,7 +608,7 @@ static int decode_coefs(Dav1dTaskContext *const t, printf("Post-dc_sign[%d][%d][%d]: r=%d\n", chroma, dc_sign_ctx, dc_sign, ts->msac.rng); - unsigned dc_dq = dq_tbl[0]; + int dc_dq = dq_tbl[0]; dc_sign_level = (dc_sign - 1) & (2 << 6); if (qm_tbl) { @@ -628,7 +628,8 @@ static int decode_coefs(Dav1dTaskContext *const t, } cul_level = dc_tok; dc_dq >>= dq_shift; - cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign); + dc_dq = umin(dc_dq, cf_max + dc_sign); + cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq); if (rc) ac_qm: { const unsigned ac_dq = dq_tbl[1]; @@ -638,6 +639,7 @@ static int decode_coefs(Dav1dTaskContext *const t, printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); const unsigned rc_tok = cf[rc]; unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5; + int dq_sat; if (rc_tok >= (15 << 11)) { tok = read_golomb(&ts->msac) + 15; @@ -654,7 +656,8 @@ static int decode_coefs(Dav1dTaskContext *const t, } cul_level += tok; dq >>= dq_shift; - cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign); + dq_sat = umin(dq, cf_max + sign); + cf[rc] = (coef) (sign ? -dq_sat : dq_sat); rc = rc_tok & 0x3ff; } while (rc); @@ -669,13 +672,13 @@ static int decode_coefs(Dav1dTaskContext *const t, dc_tok &= 0xfffff; dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift; - dc_dq = umin(dc_dq - dc_sign, cf_max); + dc_dq = umin(dc_dq, cf_max + dc_sign); } else { - dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign; + dc_dq = ((dc_dq * dc_tok) >> dq_shift); assert(dc_dq <= cf_max); } cul_level = dc_tok; - cf[0] = (coef) (dc_dq ^ -dc_sign); + cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq); if (rc) ac_noqm: { const unsigned ac_dq = dq_tbl[1]; @@ -684,7 +687,8 @@ static int decode_coefs(Dav1dTaskContext *const t, if (dbg) printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); const unsigned rc_tok = cf[rc]; - unsigned tok, dq; + unsigned tok; + int dq; // residual if (rc_tok >= (15 << 11)) { @@ -698,15 +702,15 @@ static int decode_coefs(Dav1dTaskContext *const t, // dequant, see 7.12.3 dq = ((ac_dq * tok) & 0xffffff) >> dq_shift; - dq = umin(dq - sign, cf_max); + dq = umin(dq, cf_max + sign); } else { // cannot exceed cf_max, so we can avoid the clipping tok = rc_tok >> 11; - dq = ((ac_dq * tok) >> dq_shift) - sign; + dq = ((ac_dq * tok) >> dq_shift); assert(dq <= cf_max); } cul_level += tok; - cf[rc] = (coef) (dq ^ -sign); + cf[rc] = (coef) (sign ? 
-dq : dq); rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob } while (rc); @@ -1092,9 +1096,10 @@ static int obmc(Dav1dTaskContext *const t, // only odd blocks are considered for overlap handling, hence +1 const refmvs_block *const a_r = &r[-1][t->bx + x + 1]; const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs]; + const int step4 = iclip(a_b_dim[0], 2, 16); if (a_r->ref.ref[0] > 0) { - const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]); + const int ow4 = imin(step4, b_dim[0]); const int oh4 = imin(b_dim[1], 16) >> 1; res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2, t->bx + x, t->by, pl, a_r->mv.mv[0], @@ -1105,7 +1110,7 @@ static int obmc(Dav1dTaskContext *const t, h_mul * ow4, v_mul * oh4); i++; } - x += imax(a_b_dim[0], 2); + x += step4; } } @@ -1114,10 +1119,11 @@ static int obmc(Dav1dTaskContext *const t, // only odd blocks are considered for overlap handling, hence +1 const refmvs_block *const l_r = &r[y + 1][t->bx - 1]; const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs]; + const int step4 = iclip(l_b_dim[1], 2, 16); if (l_r->ref.ref[0] > 0) { const int ow4 = imin(b_dim[0], 16) >> 1; - const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]); + const int oh4 = imin(step4, b_dim[1]); res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4, t->bx, t->by + y, pl, l_r->mv.mv[0], &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1, @@ -1127,7 +1133,7 @@ static int obmc(Dav1dTaskContext *const t, dst_stride, lap, h_mul * ow4, v_mul * oh4); i++; } - y += imax(l_b_dim[1], 2); + y += step4; } return 0; } diff --git a/chromium/third_party/dav1d/libdav1d/src/ref.c b/chromium/third_party/dav1d/libdav1d/src/ref.c index 3889cba5657..46462b4c801 100644 --- a/chromium/third_party/dav1d/libdav1d/src/ref.c +++ b/chromium/third_party/dav1d/libdav1d/src/ref.c @@ -88,22 +88,18 @@ Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr, return res; } -void dav1d_ref_inc(Dav1dRef *const ref) { - atomic_fetch_add(&ref->ref_cnt, 1); -} - void dav1d_ref_dec(Dav1dRef **const pref) { assert(pref != NULL); Dav1dRef *const ref = *pref; if (!ref) return; + *pref = NULL; if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) { const int free_ref = ref->free_ref; ref->free_callback(ref->const_data, ref->user_data); if (free_ref) free(ref); } - *pref = NULL; } int dav1d_ref_is_writable(Dav1dRef *const ref) { diff --git a/chromium/third_party/dav1d/libdav1d/src/ref.h b/chromium/third_party/dav1d/libdav1d/src/ref.h index 54f5f69f888..ec070a0a9a3 100644 --- a/chromium/third_party/dav1d/libdav1d/src/ref.h +++ b/chromium/third_party/dav1d/libdav1d/src/ref.h @@ -50,9 +50,11 @@ Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size); Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr, void (*free_callback)(const uint8_t *data, void *user_data), void *user_data); -void dav1d_ref_inc(Dav1dRef *ref); void dav1d_ref_dec(Dav1dRef **ref); - int dav1d_ref_is_writable(Dav1dRef *ref); +static inline void dav1d_ref_inc(Dav1dRef *const ref) { + atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed); +} + #endif /* DAV1D_SRC_REF_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/refmvs.c b/chromium/third_party/dav1d/libdav1d/src/refmvs.c index d49ebaeec6b..c7ed9db8cac 100644 --- a/chromium/third_party/dav1d/libdav1d/src/refmvs.c +++ b/chromium/third_party/dav1d/libdav1d/src/refmvs.c @@ -922,15 +922,23 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv, } while (--bh4); } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/refmvs.h" +#elif ARCH_X86 
+#include "src/x86/refmvs.h" +#endif +#endif + COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c) { c->splat_mv = splat_mv_c; #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - dav1d_refmvs_dsp_init_arm(c); + refmvs_dsp_init_arm(c); #elif ARCH_X86 - dav1d_refmvs_dsp_init_x86(c); + refmvs_dsp_init_x86(c); #endif #endif } diff --git a/chromium/third_party/dav1d/libdav1d/src/scan.h b/chromium/third_party/dav1d/libdav1d/src/scan.h index ca9743fd5ab..09df9887799 100644 --- a/chromium/third_party/dav1d/libdav1d/src/scan.h +++ b/chromium/third_party/dav1d/libdav1d/src/scan.h @@ -32,6 +32,6 @@ #include "src/levels.h" -extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES]; +EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES]; #endif /* DAV1D_SRC_SCAN_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/tables.h b/chromium/third_party/dav1d/libdav1d/src/tables.h index 894f8c237d9..f3c00cfb00a 100644 --- a/chromium/third_party/dav1d/libdav1d/src/tables.h +++ b/chromium/third_party/dav1d/libdav1d/src/tables.h @@ -34,38 +34,38 @@ #include "src/levels.h" -extern const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS]; -extern const uint8_t /* enum BlockSize */ +EXTERN const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS]; +EXTERN const uint8_t /* enum BlockSize */ dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2]; // width, height (in 4px blocks), log2 versions of these two -extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4]; +EXTERN const uint8_t dav1d_block_dimensions[N_BS_SIZES][4]; typedef struct TxfmInfo { // width, height (in 4px blocks), log2 of them, min/max of log2, sub, pad uint8_t w, h, lw, lh, min, max, sub, ctx; } TxfmInfo; -extern const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES]; -extern const uint8_t /* enum (Rect)TxfmSize */ +EXTERN const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES]; +EXTERN const uint8_t /* enum (Rect)TxfmSize */ dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */]; -extern const uint8_t /* enum TxfmType */ +EXTERN const uint8_t /* enum TxfmType */ dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES]; -extern const uint8_t /* enum InterPredMode */ +EXTERN const uint8_t /* enum InterPredMode */ dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2]; -extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS]; -extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40]; +EXTERN const uint8_t dav1d_partition_type_count[N_BL_LEVELS]; +EXTERN const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40]; -extern const uint8_t dav1d_filter_mode_to_y_mode[5]; -extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES]; -extern const uint8_t dav1d_lo_ctx_offsets[3][5][5]; -extern const uint8_t dav1d_skip_ctx[5][5]; -extern const uint8_t /* enum TxClass */ +EXTERN const uint8_t dav1d_filter_mode_to_y_mode[5]; +EXTERN const uint8_t dav1d_ymode_size_context[N_BS_SIZES]; +EXTERN const uint8_t dav1d_lo_ctx_offsets[3][5][5]; +EXTERN const uint8_t dav1d_skip_ctx[5][5]; +EXTERN const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL]; -extern const uint8_t /* enum Filter2d */ +EXTERN const uint8_t /* enum Filter2d */ dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */]; -extern const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2]; -extern const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES]; -extern const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES]; +EXTERN const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2]; +EXTERN const uint8_t 
dav1d_intra_mode_context[N_INTRA_PRED_MODES]; +EXTERN const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES]; static const unsigned cfl_allowed_mask = (1 << BS_32x32) | @@ -103,23 +103,23 @@ static const unsigned interintra_allowed_mask = (1 << BS_8x16) | (1 << BS_8x8); -extern const Dav1dWarpedMotionParams dav1d_default_wm_params; +EXTERN const Dav1dWarpedMotionParams dav1d_default_wm_params; -extern const int8_t dav1d_cdef_directions[12][2]; +EXTERN const int8_t dav1d_cdef_directions[12][2]; -extern const uint16_t dav1d_sgr_params[16][2]; -extern const uint8_t dav1d_sgr_x_by_x[256]; +EXTERN const uint16_t dav1d_sgr_params[16][2]; +EXTERN const uint8_t dav1d_sgr_x_by_x[256]; -extern const int8_t dav1d_mc_subpel_filters[6][15][8]; -extern const int8_t dav1d_mc_warp_filter[193][8]; -extern const int8_t dav1d_resize_filter[64][8]; +EXTERN const int8_t dav1d_mc_subpel_filters[6][15][8]; +EXTERN const int8_t dav1d_mc_warp_filter[193][8]; +EXTERN const int8_t dav1d_resize_filter[64][8]; -extern const uint8_t dav1d_sm_weights[128]; -extern const uint16_t dav1d_dr_intra_derivative[44]; -extern const int8_t dav1d_filter_intra_taps[5][64]; +EXTERN const uint8_t dav1d_sm_weights[128]; +EXTERN const uint16_t dav1d_dr_intra_derivative[44]; +EXTERN const int8_t dav1d_filter_intra_taps[5][64]; -extern const uint8_t dav1d_obmc_masks[64]; +EXTERN const uint8_t dav1d_obmc_masks[64]; -extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs +EXTERN const int16_t dav1d_gaussian_sequence[2048]; // for fgs #endif /* DAV1D_SRC_TABLES_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/thread_task.c b/chromium/third_party/dav1d/libdav1d/src/thread_task.c index 53aa41e5c8a..ab2376c30a4 100644 --- a/chromium/third_party/dav1d/libdav1d/src/thread_task.c +++ b/chromium/third_party/dav1d/libdav1d/src/thread_task.c @@ -49,9 +49,13 @@ static inline int reset_task_cur(const Dav1dContext *const c, unsigned frame_idx) { const unsigned first = atomic_load(&ttd->first); + unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX); + if (reset_frame_idx < first) { + if (frame_idx == UINT_MAX) return 0; + reset_frame_idx = UINT_MAX; + } if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL) return 0; - unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX); if (reset_frame_idx != UINT_MAX) { if (frame_idx == UINT_MAX) { if (reset_frame_idx > first + ttd->cur) @@ -78,12 +82,17 @@ cur_found: static inline void reset_task_cur_async(struct TaskThreadData *const ttd, unsigned frame_idx, unsigned n_frames) { - if (frame_idx < (unsigned)atomic_load(&ttd->first)) frame_idx += n_frames; + const unsigned first = atomic_load(&ttd->first); + if (frame_idx < first) frame_idx += n_frames; unsigned last_idx = frame_idx; do { frame_idx = last_idx; last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx); } while (last_idx < frame_idx); + if (frame_idx == first && atomic_load(&ttd->first) != first) { + unsigned expected = frame_idx; + atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX); + } } static void insert_tasks_between(Dav1dFrameContext *const f, @@ -164,6 +173,43 @@ static inline void insert_task(Dav1dFrameContext *const f, insert_tasks(f, t, t, cond_signal); } +static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) { + pthread_mutex_lock(&f->task_thread.pending_tasks.lock); + t->next = NULL; + if (!f->task_thread.pending_tasks.head) + f->task_thread.pending_tasks.head = t; + else + f->task_thread.pending_tasks.tail->next = t; + 
f->task_thread.pending_tasks.tail = t; + atomic_store(&f->task_thread.pending_tasks.merge, 1); + pthread_mutex_unlock(&f->task_thread.pending_tasks.lock); +} + +static inline int merge_pending_frame(Dav1dFrameContext *const f) { + int const merge = atomic_load(&f->task_thread.pending_tasks.merge); + if (merge) { + pthread_mutex_lock(&f->task_thread.pending_tasks.lock); + Dav1dTask *t = f->task_thread.pending_tasks.head; + f->task_thread.pending_tasks.head = NULL; + f->task_thread.pending_tasks.tail = NULL; + atomic_store(&f->task_thread.pending_tasks.merge, 0); + pthread_mutex_unlock(&f->task_thread.pending_tasks.lock); + while (t) { + Dav1dTask *const tmp = t->next; + insert_task(f, t, 0); + t = tmp; + } + } + return merge; +} + +static inline int merge_pending(const Dav1dContext *const c) { + int res = 0; + for (unsigned i = 0; i < c->n_fc; i++) + res |= merge_pending_frame(&c->fc[i]); + return res; +} + static int create_filter_sbrow(Dav1dFrameContext *const f, const int pass, Dav1dTask **res_t) { @@ -192,13 +238,14 @@ static int create_filter_sbrow(Dav1dFrameContext *const f, const int prog_sz = ((f->sbh + 31) & ~31) >> 5; if (prog_sz > f->frame_thread.prog_sz) { atomic_uint *const prog = realloc(f->frame_thread.frame_progress, - prog_sz * 2 * sizeof(*prog)); + 2 * prog_sz * sizeof(*prog)); if (!prog) return -1; f->frame_thread.frame_progress = prog; f->frame_thread.copy_lpf_progress = prog + prog_sz; - f->frame_thread.prog_sz = prog_sz; } - memset(f->frame_thread.frame_progress, 0, prog_sz * 2 * sizeof(atomic_uint)); + f->frame_thread.prog_sz = prog_sz; + memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint)); + memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint)); atomic_store(&f->frame_thread.deblock_progress, 0); } f->frame_thread.next_tile_row[pass & 1] = 0; @@ -224,16 +271,18 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass, Dav1dTask *tasks = f->task_thread.tile_tasks[0]; const int uses_2pass = f->c->n_fc > 1; const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; - int alloc_num_tasks = num_tasks * (1 + uses_2pass); - if (alloc_num_tasks > f->task_thread.num_tile_tasks) { - const size_t size = sizeof(Dav1dTask) * alloc_num_tasks; - tasks = realloc(f->task_thread.tile_tasks[0], size); - if (!tasks) return -1; - memset(tasks, 0, size); - f->task_thread.tile_tasks[0] = tasks; - f->task_thread.num_tile_tasks = alloc_num_tasks; + if (pass < 2) { + int alloc_num_tasks = num_tasks * (1 + uses_2pass); + if (alloc_num_tasks > f->task_thread.num_tile_tasks) { + const size_t size = sizeof(Dav1dTask) * alloc_num_tasks; + tasks = realloc(f->task_thread.tile_tasks[0], size); + if (!tasks) return -1; + memset(tasks, 0, size); + f->task_thread.tile_tasks[0] = tasks; + f->task_thread.num_tile_tasks = alloc_num_tasks; + } + f->task_thread.tile_tasks[1] = tasks + num_tasks; } - f->task_thread.tile_tasks[1] = tasks + num_tasks; tasks += num_tasks * (pass & 1); Dav1dTask *pf_t; @@ -263,8 +312,22 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass, prev_t->next = pf_t; prev_t = pf_t; } - insert_tasks(f, &tasks[0], prev_t, cond_signal); - f->task_thread.done[pass & 1] = 0; + prev_t->next = NULL; + + atomic_store(&f->task_thread.done[pass & 1], 0); + + // XXX in theory this could be done locklessly, at this point they are no + // tasks in the frameQ, so no other runner should be using this lock, but + // we must add both passes at once + 
pthread_mutex_lock(&f->task_thread.pending_tasks.lock); + assert(f->task_thread.pending_tasks.head == NULL || pass == 2); + if (!f->task_thread.pending_tasks.head) + f->task_thread.pending_tasks.head = &tasks[0]; + else + f->task_thread.pending_tasks.tail->next = &tasks[0]; + f->task_thread.pending_tasks.tail = prev_t; + atomic_store(&f->task_thread.pending_tasks.merge, 1); + pthread_mutex_unlock(&f->task_thread.pending_tasks.lock); return 0; } @@ -272,7 +335,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass, void dav1d_task_frame_init(Dav1dFrameContext *const f) { const Dav1dContext *const c = f->c; - f->task_thread.init_done = 0; + atomic_store(&f->task_thread.init_done, 0); // schedule init task, which will schedule the remaining tasks Dav1dTask *const t = &f->task_thread.init_task; t->type = DAV1D_TASK_TYPE_INIT; @@ -307,16 +370,12 @@ static inline int ensure_progress(struct TaskThreadData *const ttd, // so ensure that completed. if not, re-add to task-queue; else, fall-through int p1 = atomic_load(state); if (p1 < t->sby) { + t->type = type; + t->recon_progress = t->deblock_progress = 0; + *target = t->sby; + add_pending(f, t); pthread_mutex_lock(&ttd->lock); - p1 = atomic_load(state); - if (p1 < t->sby) { - t->type = type; - t->recon_progress = t->deblock_progress = 0; - *target = t->sby; - insert_task(f, t, 0); - return 1; - } - pthread_mutex_unlock(&ttd->lock); + return 1; } return 0; } @@ -369,11 +428,29 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f, return 0; } +static inline int get_frame_progress(const Dav1dContext *const c, + const Dav1dFrameContext *const f) +{ + unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0; + if (frame_prog >= FRAME_ERROR) + return f->sbh - 1; + int idx = frame_prog >> (f->sb_shift + 7); + int prog; + do { + atomic_uint *state = &f->frame_thread.frame_progress[idx]; + const unsigned val = ~atomic_load(state); + prog = val ? ctz(val) : 32; + if (prog != 32) break; + prog = 0; + } while (++idx < f->frame_thread.prog_sz); + return ((idx << 5) | prog) - 1; +} + static inline void abort_frame(Dav1dFrameContext *const f, const int error) { atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1); - f->task_thread.task_counter = 0; - f->task_thread.done[0] = 1; - f->task_thread.done[1] = 1; + atomic_store(&f->task_thread.task_counter, 0); + atomic_store(&f->task_thread.done[0], 1); + atomic_store(&f->task_thread.done[1], 1); atomic_store(&f->sr_cur.progress[0], FRAME_ERROR); atomic_store(&f->sr_cur.progress[1], FRAME_ERROR); dav1d_decode_frame_exit(f, error); @@ -478,6 +555,8 @@ void *dav1d_worker_task(void *data) { for (;;) { if (tc->task_thread.die) break; if (atomic_load(c->flush)) goto park; + + merge_pending(c); if (ttd->delayed_fg.exec) { // run delayed film grain first delayed_fg_task(c, ttd); continue; @@ -488,11 +567,18 @@ void *dav1d_worker_task(void *data) { for (unsigned i = 0; i < c->n_fc; i++) { const unsigned first = atomic_load(&ttd->first); f = &c->fc[(first + i) % c->n_fc]; - if (f->task_thread.init_done) continue; + if (atomic_load(&f->task_thread.init_done)) continue; t = f->task_thread.task_head; if (!t) continue; if (t->type == DAV1D_TASK_TYPE_INIT) goto found; if (t->type == DAV1D_TASK_TYPE_INIT_CDF) { + // XXX This can be a simple else, if adding tasks of both + // passes at once (in dav1d_task_create_tile_sbrow). + // Adding the tasks to the pending Q can result in a + // thread merging them before setting init_done. 
+ // We will need to set init_done before adding to the + // pending Q, so maybe return the tasks, set init_done, + // and add to pending Q only then. const int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1; if (p1) { @@ -505,6 +591,7 @@ void *dav1d_worker_task(void *data) { while (ttd->cur < c->n_fc) { // run decoding tasks last const unsigned first = atomic_load(&ttd->first); f = &c->fc[(first + ttd->cur) % c->n_fc]; + merge_pending_frame(f); prev_t = f->task_thread.task_cur_prev; t = prev_t ? prev_t->next : f->task_thread.task_head; while (t) { @@ -519,11 +606,12 @@ void *dav1d_worker_task(void *data) { } else if (t->recon_progress) { const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS; int error = atomic_load(&f->task_thread.error); - assert(!f->task_thread.done[p] || error); + assert(!atomic_load(&f->task_thread.done[p]) || error); const int tile_row_base = f->frame_hdr->tiling.cols * f->frame_thread.next_tile_row[p]; if (p) { - const int p1 = f->frame_thread.entropy_progress; + atomic_int *const prog = &f->frame_thread.entropy_progress; + const int p1 = atomic_load(prog); if (p1 < t->sby) goto next; atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR); } @@ -567,6 +655,7 @@ void *dav1d_worker_task(void *data) { ttd->cur++; } if (reset_task_cur(c, ttd, UINT_MAX)) continue; + if (merge_pending(c)) continue; park: tc->task_thread.flushed = 1; pthread_cond_signal(&tc->task_thread.td.cond); @@ -584,6 +673,7 @@ void *dav1d_worker_task(void *data) { if (!t->next) f->task_thread.task_tail = prev_t; if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head) ttd->cur++; + t->next = NULL; // we don't need to check cond_signaled here, since we found a task // after the last signal so we want to re-signal the next waiting thread // and again won't need to signal after that @@ -605,13 +695,13 @@ void *dav1d_worker_task(void *data) { if (res || p1 == TILE_ERROR) { pthread_mutex_lock(&ttd->lock); abort_frame(f, res ? res : DAV1D_ERR(EINVAL)); - } else if (!res) { + reset_task_cur(c, ttd, t->frame_idx); + } else { t->type = DAV1D_TASK_TYPE_INIT_CDF; if (p1) goto found_unlocked; + add_pending(f, t); pthread_mutex_lock(&ttd->lock); - insert_task(f, t, 0); } - reset_task_cur(c, ttd, t->frame_idx); continue; } case DAV1D_TASK_TYPE_INIT_CDF: { @@ -619,7 +709,6 @@ void *dav1d_worker_task(void *data) { int res = DAV1D_ERR(EINVAL); if (!atomic_load(&f->task_thread.error)) res = dav1d_decode_frame_init_cdf(f); - pthread_mutex_lock(&ttd->lock); if (f->frame_hdr->refresh_context && !f->task_thread.update_set) { atomic_store(f->out_cdf.progress, res < 0 ? 
TILE_ERROR : 1); } @@ -628,23 +717,34 @@ void *dav1d_worker_task(void *data) { for (int p = 1; p <= 2; p++) { const int res = dav1d_task_create_tile_sbrow(f, p, 0); if (res) { + pthread_mutex_lock(&ttd->lock); // memory allocation failed - f->task_thread.done[2 - p] = 1; + atomic_store(&f->task_thread.done[2 - p], 1); atomic_store(&f->task_thread.error, -1); - f->task_thread.task_counter -= f->sbh + - f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; + atomic_fetch_sub(&f->task_thread.task_counter, + f->frame_hdr->tiling.cols * + f->frame_hdr->tiling.rows + f->sbh); atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR); - if (p == 2 && f->task_thread.done[1]) { - assert(!f->task_thread.task_counter); + if (p == 2 && atomic_load(&f->task_thread.done[1])) { + assert(!atomic_load(&f->task_thread.task_counter)); dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM)); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); + atomic_store(&f->task_thread.init_done, 1); + continue; + } else { + pthread_mutex_unlock(&ttd->lock); } } } - } else abort_frame(f, res); - reset_task_cur(c, ttd, t->frame_idx); - f->task_thread.init_done = 1; + atomic_store(&f->task_thread.init_done, 1); + pthread_mutex_lock(&ttd->lock); + } else { + pthread_mutex_lock(&ttd->lock); + abort_frame(f, res); + reset_task_cur(c, ttd, t->frame_idx); + atomic_store(&f->task_thread.init_done, 1); + } continue; } case DAV1D_TASK_TYPE_TILE_ENTROPY: @@ -673,10 +773,9 @@ void *dav1d_worker_task(void *data) { pthread_cond_signal(&ttd->cond); goto found_unlocked; } - pthread_mutex_lock(&ttd->lock); atomic_store(&ts->progress[p], progress); - reset_task_cur(c, ttd, t->frame_idx); - insert_task(f, t, 0); + add_pending(f, t); + pthread_mutex_lock(&ttd->lock); } else { pthread_mutex_lock(&ttd->lock); atomic_store(&ts->progress[p], progress); @@ -692,15 +791,16 @@ void *dav1d_worker_task(void *data) { if (c->n_fc > 1) atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1); } - if (!--f->task_thread.task_counter && f->task_thread.done[0] && - (!uses_2pass || f->task_thread.done[1])) + if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 && + atomic_load(&f->task_thread.done[0]) && + (!uses_2pass || atomic_load(&f->task_thread.done[1]))) { dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : error ? DAV1D_ERR(ENOMEM) : 0); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } - assert(f->task_thread.task_counter >= 0); + assert(atomic_load(&f->task_thread.task_counter) >= 0); if (!atomic_fetch_or(&ttd->cond_signaled, 1)) pthread_cond_signal(&ttd->cond); } @@ -734,15 +834,11 @@ void *dav1d_worker_task(void *data) { if (sby) { int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]); if (~prog & (1U << ((sby - 1) & 31))) { + t->type = DAV1D_TASK_TYPE_CDEF; + t->recon_progress = t->deblock_progress = 0; + add_pending(f, t); pthread_mutex_lock(&ttd->lock); - prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]); - if (~prog & (1U << ((sby - 1) & 31))) { - t->type = DAV1D_TASK_TYPE_CDEF; - t->recon_progress = t->deblock_progress = 0; - insert_task(f, t, 0); - continue; - } - pthread_mutex_unlock(&ttd->lock); + continue; } } } @@ -776,40 +872,53 @@ void *dav1d_worker_task(void *data) { const int uses_2pass = c->n_fc > 1; const int sbh = f->sbh; const int sbsz = f->sb_step * 4; - const enum PlaneType progress_plane_type = - t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS ? PLANE_TYPE_BLOCK : - c->n_fc > 1 ? 
PLANE_TYPE_Y : PLANE_TYPE_ALL; - if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS) - atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5], - 1U << (sby & 31)); - pthread_mutex_lock(&ttd->lock); - if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS) { - unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0; - if (frame_prog < FRAME_ERROR) { - int idx = frame_prog >> (f->sb_shift + 7); - int prog; - do { - atomic_uint *state = &f->frame_thread.frame_progress[idx]; - const unsigned val = ~atomic_load(state); - prog = val ? ctz(val) : 32; - if (prog != 32) break; - prog = 0; - } while (++idx < f->frame_thread.prog_sz); - sby = ((idx << 5) | prog) - 1; - } else sby = sbh - 1; + if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) { + error = atomic_load(&f->task_thread.error); + const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz; + assert(c->n_fc > 1); + if (f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) + atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y); + atomic_store(&f->frame_thread.entropy_progress, + error ? TILE_ERROR : sby + 1); + if (sby + 1 == sbh) + atomic_store(&f->task_thread.done[1], 1); + pthread_mutex_lock(&ttd->lock); + const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1; + if (sby + 1 < sbh && num_tasks) { + reset_task_cur(c, ttd, t->frame_idx); + continue; + } + if (!num_tasks && atomic_load(&f->task_thread.done[0]) && + atomic_load(&f->task_thread.done[1])) + { + dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : + error ? DAV1D_ERR(ENOMEM) : 0); + f->n_tile_data = 0; + pthread_cond_signal(&f->task_thread.cond); + } + reset_task_cur(c, ttd, t->frame_idx); + continue; } + // t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS + atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5], + 1U << (sby & 31)); + pthread_mutex_lock(&f->task_thread.lock); + sby = get_frame_progress(c, f); error = atomic_load(&f->task_thread.error); const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz; - if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) { - const int idx = t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS; - atomic_store(&f->sr_cur.progress[idx], error ? FRAME_ERROR : y); - } - if (progress_plane_type == PLANE_TYPE_BLOCK) - f->frame_thread.entropy_progress = error ? TILE_ERROR : sby + 1; + if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) + atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y); + pthread_mutex_unlock(&f->task_thread.lock); if (sby + 1 == sbh) - f->task_thread.done[progress_plane_type == PLANE_TYPE_BLOCK] = 1; - if (!--f->task_thread.task_counter && - f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1])) + atomic_store(&f->task_thread.done[0], 1); + pthread_mutex_lock(&ttd->lock); + const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1; + if (sby + 1 < sbh && num_tasks) { + reset_task_cur(c, ttd, t->frame_idx); + continue; + } + if (!num_tasks && atomic_load(&f->task_thread.done[0]) && + (!uses_2pass || atomic_load(&f->task_thread.done[1]))) { dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : error ? 
DAV1D_ERR(ENOMEM) : 0); diff --git a/chromium/third_party/dav1d/libdav1d/src/wedge.h b/chromium/third_party/dav1d/libdav1d/src/wedge.h index 45f0570a270..586be98c42c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/wedge.h +++ b/chromium/third_party/dav1d/libdav1d/src/wedge.h @@ -31,11 +31,11 @@ #include "src/levels.h" void dav1d_init_wedge_masks(void); -extern const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] +EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] [2 /* sign */][16 /* wedge_idx */]; void dav1d_init_interintra_masks(void); -extern const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] +EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] [N_INTER_INTRA_PRED_MODES]; #endif /* DAV1D_SRC_WEDGE_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h new file mode 100644 index 00000000000..553d6507412 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h @@ -0,0 +1,87 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/cdef.h" + +#define decl_cdef_fns(ext) \ + decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \ + decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \ + decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext)) + +decl_cdef_fns(avx512icl); +decl_cdef_fns(avx2); +decl_cdef_fns(sse4); +decl_cdef_fns(ssse3); +decl_cdef_fns(sse2); + +decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2)); +decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4)); +decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3)); + +static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + +#if BITDEPTH == 8 + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + + c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2); + c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2); + c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->dir = BF(dav1d_cdef_dir, ssse3); + c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3); + c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3); + c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + + c->dir = BF(dav1d_cdef_dir, sse4); +#if BITDEPTH == 8 + c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4); + c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4); + c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4); +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->dir = BF(dav1d_cdef_dir, avx2); + c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2); + c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2); + c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl); + c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl); + c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm new file mode 100644 index 00000000000..6d625a02a0c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm @@ -0,0 +1,622 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21 + db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29 + db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37 + db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45 +end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 +edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111 + dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011 + dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111 +pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4 +cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6 + dw 1, 2, 1, 10, 9, 18, 8, 17 + dw 8, 16, 8, 15, -7,-14, 1, -6 +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28 + db 2, 4, 2, 36, 34, 68, 32, 66 + db 32, 64, 32, 62,-30,-60, 2,-28 +pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3 +sec_taps4: dw 32, 16 +pw_m16384: times 2 dw -16384 +pw_2048: times 2 dw 2048 +pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4) +edge_mask8: dw 0x2121, 0x2020, 0x0101 + +SECTION .text + +%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp + psubw %1, %2, %3 + pabsw %1, %1 + vpcmpgtw k1, %3, %2 + vpsrlvw %7, %1, %6 + psubusw %7, %5, %7 + pminsw %1, %7 + vpsubw %1{k1}, %4, %1 +%endmacro + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs4 + lea r6, [cdef_dirs4] + movu xm3, [dstq+strideq*0] + vinserti32x4 ym3, [dstq+strideq*1], 1 + mova xm2, [leftq] + lea r2, [dstq+strideq*2] + vinserti32x4 m3, [r2+strideq*0], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m3, [r2+strideq*1], 3 + vpermt2d m2, m5, m3 + vinserti32x4 m1, m2, [topq+strideq*0-4], 0 + vinserti32x4 m1, [topq+strideq*1-4], 1 + mov r3d, edgem + movifnidn prid, prim + punpcklwd m3, m3 ; px + psrlw m5, 8 + vpbroadcastd m0, [base+pd_268435568] + pxor m12, m12 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m6, m3, m8 + pmaxsw m7, m3, m8 + pminuw m6, m9 + pmaxsw m7, m9 + call .constrain_sec + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + psrldq m8, m6, 2 + vpshldd m3, m0, 8 + psrldq m9, m7, 2 + paddd m0, m3 + pminuw m6, m8 + psrldq m0, 1 + pmaxsw m7, m9 + pmaxsw m0, m6 + pminsw m0, m7 + vpmovdw ym0, m0 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + mov r4d, dirm + sub r3d, r5d ; 
sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym1, [base+end_perm4] + vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + vpermb m0, m1, m0 +.end: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm0, ym0, 1 + movq [r2+strideq*0], xm0 + movhps [r2+strideq*1], xm0 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + or r3d, 0x04 + vmovdqa32 m1{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m8, m5, m9 + vpermi2w m8, m1, m2 ; k0p0 k1p0 + psubw m9, m5, m9 + vpermi2w m9, m1, m2 ; k0p1 k1p1 + CONSTRAIN m10, m8, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + CONSTRAIN m10, m9, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + ret + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 + +cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge + lea r6, [cdef_dirs4] + movu xm18, [dstq+strideq*0] + vinserti128 ym18, [dstq+strideq*1], 1 + mova xm1, [leftq+16*0] + mova xm2, [leftq+16*1] + lea r2, [strideq*3] + vinserti32x4 m18, [dstq+strideq*2], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m18, [dstq+r2 ], 3 + vpermt2d m1, m5, m18 + vinserti32x4 m0, m1, [topq+strideq*0-4], 0 + vinserti32x4 m0, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu xm19, [r3+strideq*0] + vinserti128 ym19, [r3+strideq*1], 1 + vinserti32x4 m19, [r3+strideq*2], 2 + vinserti32x4 m19, [r3+r2 ], 3 + mov r3d, edgem + movifnidn prid, prim + vpermt2d m2, m5, m19 + vpbroadcastd m16, [base+pd_268435568] + pxor m12, m12 + punpcklwd m18, m18 ; px (top) + psrlw m5, 8 + punpcklwd m19, m19 ; px (bottom) + mova m17, m16 + vshufi32x4 m1, m2, q3210 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m3, m18, m6 + pmaxsw m4, m18, m6 + pminuw m20, m19, m7 + pmaxsw m21, m19, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + call .constrain_sec + pminuw m3, m6 + pmaxsw m4, m6 + pminuw m20, m7 + pmaxsw m21, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m3, m6 + pmaxsw m4, m6 + mov r3, 
0xcccccccccccccccc + pminuw m20, m7 + pmaxsw m21, m7 + kmovq k1, r3 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vbroadcasti32x4 m0, [base+deint_shuf] + vpshldd m6, m20, m3, 16 + vmovdqu8 m3{k1}, m20 + vpshldd m18, m16, 8 + vpshldd m7, m21, m4, 16 + vmovdqu8 m4{k1}, m21 + vpshldd m19, m17, 8 + pminuw m3, m6 + paddd m16, m18 + pmaxsw m4, m7 + paddd m17, m19 + psrldq m16, 1 + palignr m16{k1}, m17, m17, 15 + lea r6, [dstq+strideq*4] + pmaxsw m16, m3 + pminsw m16, m4 + pshufb m16, m0 + movq [dstq+strideq*0], xm16 + movhps [r6 +strideq*0], xm16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*1], xm17 + movhps [r6 +strideq*1], xm17 + vextracti32x4 xm17, m16, 2 + movq [dstq+strideq*2], xm17 + movhps [r6 +strideq*2], xm17 + vextracti32x4 xm16, m16, 3 + movq [dstq+r2 ], xm16 + movhps [r6 +r2 ], xm16 + RET +.sec_only: + mov r4d, dirm + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d ; sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym20, [base+end_perm4] + vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m19, m17, 8 + paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddd m17, m19 + vpermb m16, m20, m16 + vpermb m17, m20, m17 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + vextracti128 xm16, ym16, 1 + movq [dstq+strideq*2], xm16 + movhps [dstq+r2 ], xm16 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm17 + movhps [dstq+strideq*1], xm17 + vextracti128 xm17, ym17, 1 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + mov r4d, r3d + or r3d, 0x0c + vmovdqa32 m0{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + or r4d, 0x04 + vmovdqa32 m1{k1}, m6 + kmovw k1, [base+edge_mask4-8+r4*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m7, m5, m9 + mova m6, m0 + vpermt2w m6, m7, m1 ; k0p0 k1p0 (top) + psubw m9, m5, m9 + mova m8, m0 + vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom) + CONSTRAIN m10, m6, m18, m12, m13, m14, m11 + vpermt2w m8, m9, m1 ; k0p1 k1p1 (top) + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m7, m19, m12, m13, m14, m11 + vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom) + vpdpwssd m17, m10, m15 + CONSTRAIN m10, m8, m18, m12, m13, m14, m11 + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m9, m19, m12, m13, m14, m11 + vpdpwssd m17, m10, m15 + ret + +cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs8 + lea r6, [cdef_dirs8] + movu ym17, [dstq+strideq*0] + vinserti32x8 m17, [dstq+strideq*1], 1 + movq xm4, [leftq+8*0] + movq xm5, [leftq+8*1] + psrld m2, [base+cdef_perm], 16 + movq xm6, [leftq+8*2] + movq xm7, [leftq+8*3] + lea r2, [strideq*3] + movu ym16, [topq+strideq*0-4] + vinserti32x8 m16, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu ym18, [dstq+strideq*2] + vinserti32x8 m18, [dstq+r2 ], 1 + movu ym19, [r3+strideq*0] + vinserti32x8 m19, [r3+strideq*1], 1 + movu ym20, [r3+strideq*2] + vinserti32x8 m20, [r3+r2 ], 1 + vshufi32x4 m0, m17, m18, q2020 ; px (top) + mov 
r3d, edgem + vshufi32x4 m1, m19, m20, q2020 ; px (bottom) + movifnidn prid, prim + vpermt2d m17, m2, m4 + vpermt2d m18, m2, m5 + pxor m12, m12 + vpermt2d m19, m2, m6 + vpermt2d m20, m2, m7 + cmp r3d, 0x0f + jne .mask_edges + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 +.main: + mova [rsp+64*0], m16 ; top + mova [rsp+64*1], m17 ; 0 1 + mova [rsp+64*2], m18 ; 2 3 + mova [rsp+64*3], m19 ; 4 5 + mova [rsp+64*4], m20 ; 6 7 + mova [rsp+64*5], m21 ; bottom + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + add r4d, r3d ; pri_shift + vpbroadcastw m14, r4d + mov r4d, dirm + vpbroadcastd m2, [base+pri_taps8+priq*2+0] + vpbroadcastd m3, [base+pri_taps8+priq*2+4] + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1 + pmaxsw m14, m12 + call .constrain + mov r5d, secm + pmullw m16, m8, m2 + pmullw m17, m9, m2 + test r5d, r5d + jnz .pri_sec + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + call .constrain + pmullw m8, m3 + pmullw m9, m3 + jmp .end_no_clip +.pri_sec: + lzcnt r5d, r5d + add r3d, r5d ; sec_shift + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + pminuw m18, m0, m4 + pmaxsw m19, m0, m4 + pminuw m20, m1, m5 + pmaxsw m21, m1, m5 + call .min_max_constrain2 + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2 + pmullw m8, m3 + pmullw m9, m3 + vpbroadcastw m13, secm + vpbroadcastw m14, r3d + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3 + mova m2, m8 + mova m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2 + paddw m2, m8 + paddw m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3 + paddw m2, m2 + paddw m3, m3 + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + vpbroadcastd m10, [base+pw_2048] + paddw m16, m2 + paddw m17, m3 + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 + paddw m16, m0 + paddw m17, m1 + pmaxsw m16, m18 + pmaxsw m17, m20 + pminsw m16, m19 + pminsw m17, m21 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r4d, dirm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] + vpbroadcastw m14, r3d + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] + mova m16, m8 + mova m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] + paddw m16, m8 + paddw m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] + paddw m16, m16 + paddw m17, m17 + paddw m16, m8 + paddw m17, m9 + call .constrain +.end_no_clip: + vpbroadcastd m10, [base+pw_2048] + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + paddw m16, m0 + paddw m17, m1 +.end: + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm17 + vextracti128 [dstq+strideq*1], ym17, 1 + vextracti32x4 [dstq+strideq*2], m17, 2 + vextracti32x4 [dstq+r2 ], m17, 3 + RET +.mask_edges: + vpbroadcastd m2, [base+pw_m16384] + test r3b, 0x08 + jz 
.mask_edges_no_bottom ; avoid buffer overread + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 + jmp .mask_edges_top +.mask_edges_no_bottom: + mova m21, m2 +.mask_edges_top: + test r3b, 0x04 + jnz .mask_edges_main + mova m16, m2 +.mask_edges_main: + and r3d, 0x03 + cmp r3d, 0x03 + je .main + kmovw k1, [base+edge_mask8+r3*2] + vmovdqa32 m16{k1}, m2 ; edge pixels = -16384 + vmovdqa32 m17{k1}, m2 + vmovdqa32 m18{k1}, m2 + vmovdqa32 m19{k1}, m2 + vmovdqa32 m20{k1}, m2 + vmovdqa32 m21{k1}, m2 + jmp .main +ALIGN function_align +.min_max_constrain: + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 +.min_max_constrain2: + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 +.constrain: + %define tmp rsp+gprsize+68 + movu m4, [tmp+r5+64*0] + vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top) + movu m5, [tmp+r5+64*2] + vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom) + neg r5 + movu m6, [tmp+r5+64*0] + vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top) + movu m7, [tmp+r5+64*2] + vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom) + CONSTRAIN m8, m4, m0, m12, m13, m14, m15 + CONSTRAIN m9, m5, m1, m12, m13, m14, m15 + CONSTRAIN m10, m6, m0, m12, m13, m14, m15 + CONSTRAIN m11, m7, m1, m12, m13, m14, m15 + paddw m8, m10 + paddw m9, m11 + ret + +%endif ; ARCH_X86_64 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h new file mode 100644 index 00000000000..eeaa328d1e1 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h @@ -0,0 +1,81 @@ +/* + * Copyright © 2018-2022, VideoLAN and dav1d authors + * Copyright © 2018-2022, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/filmgrain.h" + +#define decl_fg_fns(ext) \ +decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \ +decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext)) + +decl_fg_fns(ssse3); +decl_fg_fns(avx2); +decl_fg_fns(avx512icl); + +static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3); + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); + } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h new file mode 100644 index 00000000000..7df563fee1c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h @@ -0,0 +1,146 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/ipred.h" + +#define decl_fn(type, name) \ + decl_##type##_fn(BF(dav1d_##name, ssse3)); \ + decl_##type##_fn(BF(dav1d_##name, avx2)); \ + decl_##type##_fn(BF(dav1d_##name, avx512icl)) +#define init_fn(type0, type1, name, suffix) \ + c->type0[type1] = BF(dav1d_##name, suffix) + +#define init_angular_ipred_fn(type, name, suffix) \ + init_fn(intra_pred, type, name, suffix) +#define init_cfl_pred_fn(type, name, suffix) \ + init_fn(cfl_pred, type, name, suffix) +#define init_cfl_ac_fn(type, name, suffix) \ + init_fn(cfl_ac, type, name, suffix) + +decl_fn(angular_ipred, ipred_dc); +decl_fn(angular_ipred, ipred_dc_128); +decl_fn(angular_ipred, ipred_dc_top); +decl_fn(angular_ipred, ipred_dc_left); +decl_fn(angular_ipred, ipred_h); +decl_fn(angular_ipred, ipred_v); +decl_fn(angular_ipred, ipred_paeth); +decl_fn(angular_ipred, ipred_smooth); +decl_fn(angular_ipred, ipred_smooth_h); +decl_fn(angular_ipred, ipred_smooth_v); +decl_fn(angular_ipred, ipred_z1); +decl_fn(angular_ipred, ipred_z2); +decl_fn(angular_ipred, ipred_z3); +decl_fn(angular_ipred, ipred_filter); + +decl_fn(cfl_pred, ipred_cfl); +decl_fn(cfl_pred, ipred_cfl_128); +decl_fn(cfl_pred, ipred_cfl_top); +decl_fn(cfl_pred, ipred_cfl_left); + +decl_fn(cfl_ac, ipred_cfl_ac_420); +decl_fn(cfl_ac, ipred_cfl_ac_422); +decl_fn(cfl_ac, ipred_cfl_ac_444); + +decl_fn(pal_pred, pal_pred); + +static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3); + init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3); + init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3); + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3); + + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3); + + c->pal_pred = BF(dav1d_pal_pred, ssse3); + +#if ARCH_X86_64 + if (!(flags & 
DAV1D_X86_CPU_FLAG_AVX2)) return; + + init_angular_ipred_fn(DC_PRED, ipred_dc, avx2); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx2); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx2); + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2); + init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2); + init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2); + init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2); + + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2); + + c->pal_pred = BF(dav1d_pal_pred, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); +#endif + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); + + c->pal_pred = BF(dav1d_pal_pred, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm index 4a1b060bd5f..1a307adc985 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm @@ -114,20 +114,20 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h vbroadcasti32x4 m2, [tlq] pshufb m2, m7 ; left PAETH 4, 5, 6 - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm8, ym0, 1 + vextracti32x4 xm9, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+r6 ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm8 + movq [dstq+r6 ], xm9 sub hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+r6 ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm8 + movhps [dstq+r6 ], xm9 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_end: @@ -220,19 +220,19 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pshufb m3, m4 pmulhrsw m3, m5 paddw m3, m6 - vextracti32x4 xmm0, m3, 3 - vextracti32x4 xmm1, ym3, 1 - vextracti32x4 xmm2, m3, 2 - movhps [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 + 
vextracti32x4 xm0, m3, 3 + vextracti32x4 xm1, ym3, 1 + vextracti32x4 xm2, m3, 2 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 add hq, 8 jg .end lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jl .w4_loop @@ -337,20 +337,20 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 psubw m0, m6 ; left - right pmulhrsw m0, m5 paddw m0, m6 - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 sub hd, 8*2 jl .end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w4_loop .end: @@ -472,11 +472,11 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 vpdpwssd m0, m1, m6 vpermb m0, m14, m0 pavgw ym0, ym15 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add v_weightsq, 4*4 sub hd, 4*2 @@ -624,11 +624,11 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 pmovzxbw ym0, [idxq] add idxq, 16 vpermw ym0, ym0, ym3 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - movq [dstq+strideq*2], xmm1 - movhps [dstq+stride3q ], xmm1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm index 050ec9bb253..38c86b54f5c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm @@ -242,9 +242,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 jmp wq .w8: movq xmm1, [tlq+1] - vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 - paddd xmm2, xm0 + paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 @@ -275,9 +275,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 jmp wq .w16: movu xmm1, [tlq+1] - vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 - paddd xmm2, xm0 + paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 @@ -309,8 +309,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 .w32: movu ym1, [tlq+1] vpdpbusd ym0, ym1, ym3 - vextracti32x4 xmm1, ym0, 1 - paddd xmm1, xm0 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 @@ -345,8 +345,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 movu ym2, [tlq+33] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 - 
vextracti32x4 xmm1, ym0, 1 - paddd xmm1, xm0 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 @@ -524,12 +524,12 @@ INIT_YMM avx512icl pextrd [dstq+stride3q ], xm0, 3 sub hd, 8 jl .w4_ret - vextracti32x4 xmm0, m0, 1 + vextracti32x4 xm0, m0, 1 lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0], xmm0 - pextrd [dstq+strideq*1], xmm0, 1 - pextrd [dstq+strideq*2], xmm0, 2 - pextrd [dstq+stride3q ], xmm0, 3 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_ret: @@ -545,20 +545,20 @@ INIT_ZMM avx512icl vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 PAETH - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 sub hd, 8 jl .w8_ret lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w8_loop .w8_ret: @@ -639,18 +639,18 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 add hq, 8 jg .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jl .w4_loop .ret: @@ -669,11 +669,11 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop @@ -785,18 +785,18 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: @@ -815,11 +815,11 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q 
], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop @@ -937,18 +937,18 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: @@ -978,11 +978,11 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx.h b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h new file mode 100644 index 00000000000..46cfdb75d1d --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h @@ -0,0 +1,356 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/itx.h" + +#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix + +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + +#define decl_itx_fns(ext) \ +decl_itx17_fns( 4, 4, ext); \ +decl_itx16_fns( 4, 8, ext); \ +decl_itx16_fns( 4, 16, ext); \ +decl_itx16_fns( 8, 4, ext); \ +decl_itx16_fns( 8, 8, ext); \ +decl_itx16_fns( 8, 16, ext); \ +decl_itx2_fns ( 8, 32, ext); \ +decl_itx16_fns(16, 4, ext); \ +decl_itx16_fns(16, 8, ext); \ +decl_itx12_fns(16, 16, ext); \ +decl_itx2_fns (16, 32, ext); \ +decl_itx2_fns (32, 8, ext); \ +decl_itx2_fns (32, 16, ext); \ +decl_itx2_fns (32, 32, ext); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext)) + + +#define decl_itx2_bpc_fns(w, h, bpc, opt) \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt)) + +#define decl_itx12_bpc_fns(w, h, bpc, opt) \ +decl_itx2_bpc_fns(w, h, bpc, opt); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt)) + +#define decl_itx16_bpc_fns(w, h, bpc, opt) \ +decl_itx12_bpc_fns(w, h, bpc, opt); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, 
opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt)) + +#define decl_itx_bpc_fns(bpc, ext) \ +decl_itx16_bpc_fns( 4, 4, bpc, ext); \ +decl_itx16_bpc_fns( 4, 8, bpc, ext); \ +decl_itx16_bpc_fns( 4, 16, bpc, ext); \ +decl_itx16_bpc_fns( 8, 4, bpc, ext); \ +decl_itx16_bpc_fns( 8, 8, bpc, ext); \ +decl_itx16_bpc_fns( 8, 16, bpc, ext); \ +decl_itx2_bpc_fns ( 8, 32, bpc, ext); \ +decl_itx16_bpc_fns(16, 4, bpc, ext); \ +decl_itx16_bpc_fns(16, 8, bpc, ext); \ +decl_itx12_bpc_fns(16, 16, bpc, ext); \ +decl_itx2_bpc_fns (16, 32, bpc, ext); \ +decl_itx2_bpc_fns (32, 8, bpc, ext); \ +decl_itx2_bpc_fns (32, 16, bpc, ext); \ +decl_itx2_bpc_fns (32, 32, bpc, ext); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext)) + +decl_itx_fns(avx512icl); +decl_itx_bpc_fns(10, avx512icl); +decl_itx_fns(avx2); +decl_itx_bpc_fns(10, avx2); +decl_itx_bpc_fns(12, avx2); +decl_itx_fns(sse4); +decl_itx_fns(ssse3); +decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); + +static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) { +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) + +#define assign_itx1_fn(pfx, w, h, ext) \ + assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) + +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + + +#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) + +#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext) + +#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext) + +#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \ + 
assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext) + +#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext) + + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + +#if BITDEPTH == 8 + assign_itx16_fn(, 4, 4, ssse3); + assign_itx16_fn(R, 4, 8, ssse3); + assign_itx16_fn(R, 8, 4, ssse3); + assign_itx16_fn(, 8, 8, ssse3); + assign_itx16_fn(R, 4, 16, ssse3); + assign_itx16_fn(R, 16, 4, ssse3); + assign_itx16_fn(R, 8, 16, ssse3); + assign_itx16_fn(R, 16, 8, ssse3); + assign_itx12_fn(, 16, 16, ssse3); + assign_itx2_fn (R, 8, 32, ssse3); + assign_itx2_fn (R, 32, 8, ssse3); + assign_itx2_fn (R, 16, 32, ssse3); + assign_itx2_fn (R, 32, 16, ssse3); + assign_itx2_fn (, 32, 32, ssse3); + assign_itx1_fn (R, 16, 64, ssse3); + assign_itx1_fn (R, 32, 64, ssse3); + assign_itx1_fn (R, 64, 16, ssse3); + assign_itx1_fn (R, 64, 32, ssse3); + assign_itx1_fn ( , 64, 64, ssse3); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + +#if BITDEPTH == 16 + if (bpc == 10) { + assign_itx16_fn(, 4, 4, sse4); + assign_itx16_fn(R, 4, 8, sse4); + assign_itx16_fn(R, 4, 16, sse4); + assign_itx16_fn(R, 8, 4, sse4); + assign_itx16_fn(, 8, 8, sse4); + assign_itx16_fn(R, 8, 16, sse4); + assign_itx16_fn(R, 16, 4, sse4); + assign_itx16_fn(R, 16, 8, sse4); + assign_itx12_fn(, 16, 16, sse4); + assign_itx2_fn (R, 8, 32, sse4); + assign_itx2_fn (R, 32, 8, sse4); + assign_itx2_fn (R, 16, 32, sse4); + assign_itx2_fn (R, 32, 16, sse4); + assign_itx2_fn (, 32, 32, sse4); + assign_itx1_fn (R, 16, 64, sse4); + assign_itx1_fn (R, 32, 64, sse4); + assign_itx1_fn (R, 64, 16, sse4); + assign_itx1_fn (R, 64, 32, sse4); + assign_itx1_fn (, 64, 64, sse4); + } +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); + +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx2); + assign_itx16_fn(R, 4, 8, avx2); + assign_itx16_fn(R, 4, 16, avx2); + assign_itx16_fn(R, 8, 4, avx2); + assign_itx16_fn( , 8, 8, avx2); + assign_itx16_fn(R, 8, 16, avx2); + assign_itx2_fn (R, 8, 32, avx2); + assign_itx16_fn(R, 16, 4, avx2); + assign_itx16_fn(R, 16, 8, avx2); + assign_itx12_fn( , 16, 16, avx2); + assign_itx2_fn (R, 16, 32, avx2); + assign_itx1_fn (R, 16, 64, avx2); + assign_itx2_fn (R, 32, 8, avx2); + assign_itx2_fn (R, 32, 16, avx2); + assign_itx2_fn ( , 32, 32, avx2); + assign_itx1_fn (R, 32, 64, avx2); + assign_itx1_fn (R, 64, 16, avx2); + 
assign_itx1_fn (R, 64, 32, avx2); + assign_itx1_fn ( , 64, 64, avx2); +#else + if (bpc == 10) { + assign_itx16_bpc_fn( , 4, 4, 10, avx2); + assign_itx16_bpc_fn(R, 4, 8, 10, avx2); + assign_itx16_bpc_fn(R, 4, 16, 10, avx2); + assign_itx16_bpc_fn(R, 8, 4, 10, avx2); + assign_itx16_bpc_fn( , 8, 8, 10, avx2); + assign_itx16_bpc_fn(R, 8, 16, 10, avx2); + assign_itx2_bpc_fn (R, 8, 32, 10, avx2); + assign_itx16_bpc_fn(R, 16, 4, 10, avx2); + assign_itx16_bpc_fn(R, 16, 8, 10, avx2); + assign_itx12_bpc_fn( , 16, 16, 10, avx2); + assign_itx2_bpc_fn (R, 16, 32, 10, avx2); + assign_itx1_bpc_fn (R, 16, 64, 10, avx2); + assign_itx2_bpc_fn (R, 32, 8, 10, avx2); + assign_itx2_bpc_fn (R, 32, 16, 10, avx2); + assign_itx2_bpc_fn ( , 32, 32, 10, avx2); + assign_itx1_bpc_fn (R, 32, 64, 10, avx2); + assign_itx1_bpc_fn (R, 64, 16, 10, avx2); + assign_itx1_bpc_fn (R, 64, 32, 10, avx2); + assign_itx1_bpc_fn ( , 64, 64, 10, avx2); + } else { + assign_itx16_bpc_fn( , 4, 4, 12, avx2); + assign_itx16_bpc_fn(R, 4, 8, 12, avx2); + assign_itx16_bpc_fn(R, 4, 16, 12, avx2); + assign_itx16_bpc_fn(R, 8, 4, 12, avx2); + assign_itx16_bpc_fn( , 8, 8, 12, avx2); + assign_itx16_bpc_fn(R, 8, 16, 12, avx2); + assign_itx2_bpc_fn (R, 8, 32, 12, avx2); + assign_itx16_bpc_fn(R, 16, 4, 12, avx2); + assign_itx16_bpc_fn(R, 16, 8, 12, avx2); + assign_itx12_bpc_fn( , 16, 16, 12, avx2); + assign_itx2_bpc_fn (R, 32, 8, 12, avx2); + } +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx512icl); // no wht + assign_itx16_fn(R, 4, 8, avx512icl); + assign_itx16_fn(R, 4, 16, avx512icl); + assign_itx16_fn(R, 8, 4, avx512icl); + assign_itx16_fn( , 8, 8, avx512icl); + assign_itx16_fn(R, 8, 16, avx512icl); + assign_itx2_fn (R, 8, 32, avx512icl); + assign_itx16_fn(R, 16, 4, avx512icl); + assign_itx16_fn(R, 16, 8, avx512icl); + assign_itx12_fn( , 16, 16, avx512icl); + assign_itx2_fn (R, 16, 32, avx512icl); + assign_itx1_fn (R, 16, 64, avx512icl); + assign_itx2_fn (R, 32, 8, avx512icl); + assign_itx2_fn (R, 32, 16, avx512icl); + assign_itx2_fn ( , 32, 32, avx512icl); + assign_itx1_fn (R, 32, 64, avx512icl); + assign_itx1_fn (R, 64, 16, avx512icl); + assign_itx1_fn (R, 64, 32, avx512icl); + assign_itx1_fn ( , 64, 64, avx512icl); +#else + if (bpc == 10) { + assign_itx16_bpc_fn( , 8, 8, 10, avx512icl); + assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl); + assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl); + assign_itx12_bpc_fn( , 16, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl); + } +#endif +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm index c580944c7bb..811f711540f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm @@ -30,7 +30,6 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482 itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 @@ -39,14 +38,17 @@ iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 -iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 idct32_shuf: db 0, 1, 
8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 -%macro COEF_PAIR 2 +%macro COEF_PAIR 2-3 0 pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) %define pd_%2 (pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define pd_%2_m%2 pd_%2 +%endif %endmacro COEF_PAIR 201, 995 @@ -56,8 +58,8 @@ COEF_PAIR 1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 COEF_PAIR 2751, 2106 -COEF_PAIR 2896, 1567 -COEF_PAIR 2896, 3784 +COEF_PAIR 2896, 1567, 1 +COEF_PAIR 2896, 3784, 1 COEF_PAIR 3035, 3513 COEF_PAIR 3166, 3920 COEF_PAIR 3703, 3290 @@ -66,9 +68,6 @@ COEF_PAIR 4017, 2276 COEF_PAIR 4076, 3612 COEF_PAIR 4091, 3973 -%define pd_1321 (pd_1321_2482 + 4*0) -%define pd_2482 (pd_1321_2482 + 4*4) - pd_8: dd 8 pd_m601: dd -601 pd_m1189: dd -1189 @@ -77,17 +76,23 @@ pd_m2106: dd -2106 pd_m2598: dd -2598 pd_m2751: dd -2751 pd_m3344: dd -3344 +pd_1024: dd 1024 +pd_1321: dd 1321 +pd_1448: dd 1448 +pd_1697: dd 1697 +pd_2482: dd 2482 +pd_3072: dd 3072 ; 1024 + 2048 pd_3803: dd 3803 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 pd_6144: dd 6144 ; 2048 + 4096 -pd_10239: dd 10239 ; 2048 + 8192 - 1 -pd_10240: dd 10240 ; 2048 + 8192 -pd_11586: dd 11586 ; 5793 * 2 -pd_34816: dd 34816 ; 2048 + 32768 -pd_38912: dd 38912 ; 2048 + 4096 + 32768 +pd_17408: dd 17408 ; 1024 + 16384 pixel_10bpc_max: times 2 dw 0x03ff pixel_12bpc_max: times 2 dw 0x0fff +dconly_10bpc: times 2 dw 0x7c00 +dconly_12bpc: times 2 dw 0x7000 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff clip_20b_min: dd -0x80000 @@ -214,7 +219,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 -; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2 +; flags: 1 = packed, 2 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags %if %8 < 32 @@ -241,7 +246,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax pmulld m%1, m%5 pmulld m%2, m%5 %endif -%if %9 & 4 +%if %9 & 2 psubd m%4, m%6, m%4 psubd m%2, m%4, m%2 %else @@ -250,17 +255,10 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax %endif paddd m%2, m%4 %endif -%if %9 & 2 ; invert the upper half of dst1 before rounding - vbroadcasti128 m%4, [pw_2048_m2048] - psubd m%1, m%3 - psignd m%1, m%4 - paddd m%1, m%6 -%else %ifnum %6 paddd m%1, m%6 %endif psubd m%1, m%3 -%endif %ifnum %6 psrad m%2, 12 psrad m%1, 12 @@ -287,37 +285,39 @@ ALIGN function_align %endif %endmacro -%macro INV_TXFM_4X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x4 -%ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - movd xm1, [pw_2896x8] - mov [cq], eobd ; 0 - add r6d, 2048 - sar r6d, 12 - movd xm0, r6d - packssdw xm0, xm0 - pmulhrsw xm0, xm1 - vpbroadcastw xm0, xm0 - mova xm1, xm0 - jmp m(iadst_4x4_internal_10bpc).end -%endif -%endmacro - -%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x4, 12 +%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x4, %3 %ifidn %1_%2, dct_dct + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 + or r3d, 4 +.dconly2: add r6d, 128 sar r6d, 8 +.dconly3: imul r6d, 181 - add r6d, 128 - sar r6d, 8 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d - vpbroadcastd m0, xm0 - mova m1, m0 - jmp m(iadst_4x4_internal_12bpc).end + paddsw xm0, xm2 + vpbroadcastw xm0, xm0 +.dconly_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddsw xm1, xm0 + 
psubusw xm1, xm2 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + WRAP_XMM RET +%else + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly +%endif %endif %endmacro @@ -399,12 +399,50 @@ INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity +%macro IADST4_1D 0 + vpbroadcastd m5, [pd_1321] + vpbroadcastd m7, [pd_2482] + pmulld m4, m0, m5 ; 1321*in0 + pmulld m6, m3, m7 ; 2482*in3 + paddd m4, m6 ; 1321*in0 + 2482*in3 + pmulld m6, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m5 ; pd_3803 + pmulld m5, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [pd_m3344] + pmulld m1, m0 ; -t3 + pmulld m2, m0 ; out2 (unrounded) + psubd m6, m5 ; 2482*in0 - 1321*in2 + paddd m4, m7 ; t0 + psubd m6, m3 ; t1 + paddd m3, m4, m6 + psubd m4, m1 ; out0 (unrounded) + psubd m6, m1 ; out1 (unrounded) + paddd m3, m1 ; out3 (unrounded) +%endmacro + cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass1_end: + vpbroadcastd m5, [pd_2048] + mova m2, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 packssdw m0, m1 - vpermd m0, m4, m0 - psrld m4, 4 - pshufb m0, m4 + vpermd m0, m2, m0 + psrld m2, 4 + pshufb m0, m2 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif jmp tx2q .pass2: lea r6, [deint_shuf+128] @@ -436,35 +474,16 @@ cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 RET ALIGN function_align .main: - mova m2, [cq+16*2] - vbroadcasti128 m5, [cq+16*0] + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] +%if WIN64 + movaps [rsp+16], xmm6 + movaps [rsp+32], xmm7 +%endif .main2: - mova m0, [pd_1321_2482] - vpbroadcastd m3, [pd_3803] - vpbroadcastd m1, [pd_m3344] - pmulld m4, m0, m2 - pmulld m3, m2 - pmulld m0, m5 - vpbroadcastd m5, [pd_2048] - psubd xm2, [cq+16*3] - psubd m2, [cq+16*0] - pmulld m2, m1 ; t2 t3 - vpermq m4, m4, q1032 - paddd m4, m3 - psubd m0, m4 - paddd xm4, xm4 - paddd m4, m0 ; t0 t1 - vinserti128 m3, m2, xm4, 1 ; t2 t0 - paddd m0, m4, m5 - psubd xm4, xm2 - psubd m1, m0, m2 - vpermq m2, m2, q3232 ; t3 t3 - psubd m1, m4 - mova m4, [itx4_shuf] - paddd m0, m2 ; out0 out1 - paddd m1, m3 ; out2 out3 - psrad m0, 12 - psrad m1, 12 + WRAP_XMM IADST4_1D ret INV_TXFM_4X4_FN flipadst, dct @@ -474,12 +493,9 @@ INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - packssdw m0, m1 - psrld m1, m4, 8 - vpermd m0, m1, m0 - psrld m4, 4 - pshufb m0, m4 - jmp tx2q + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_10bpc).pass1_end .pass2: lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 @@ -556,19 +572,20 @@ cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 movhps [r6 +strideq*1], xm1 RET -INV_TXFM_4X4_12BPC_FN dct, dct -INV_TXFM_4X4_12BPC_FN dct, identity -INV_TXFM_4X4_12BPC_FN dct, adst -INV_TXFM_4X4_12BPC_FN dct, flipadst +INV_TXFM_4X4_FN dct, dct, 12 +INV_TXFM_4X4_FN dct, identity, 12 +INV_TXFM_4X4_FN dct, adst, 12 +INV_TXFM_4X4_FN dct, flipadst, 12 -cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(idct_4x4_internal_10bpc).main mova m3, [idct4_12_shuf] mova m4, [idct4_12_shuf2] - vpermd m2, m3, m0 - vpermd m1, m4, m1 - jmp 
m(iadst_4x4_internal_12bpc).pass1_end + vpermd m2, m4, m1 + vpermd m1, m3, m0 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: + vpbroadcastd m5, [pd_2048] vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(idct_4x4_internal_10bpc).main2 @@ -576,33 +593,52 @@ cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 vpermq m1, m1, q2031 jmp m(iadst_4x4_internal_12bpc).end -INV_TXFM_4X4_12BPC_FN adst, dct -INV_TXFM_4X4_12BPC_FN adst, adst -INV_TXFM_4X4_12BPC_FN adst, flipadst -INV_TXFM_4X4_12BPC_FN adst, identity +INV_TXFM_4X4_FN adst, dct, 12 +INV_TXFM_4X4_FN adst, adst, 12 +INV_TXFM_4X4_FN adst, flipadst, 12 +INV_TXFM_4X4_FN adst, identity, 12 -cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - vpermd m2, m4, m0 - vpermd m1, m4, m1 + vinserti128 m1, m4, xm6, 1 + vinserti128 m2, xm3, 1 .pass1_end: - punpcklqdq m0, m2, m1 - punpckhqdq m1, m2, m1 + mova m3, [itx4_shuf] + vpbroadcastd m5, [pd_1024] + psrad m1, 1 + psrad m2, 1 + vpermd m1, m3, m1 + vpermd m2, m3, m2 + paddd m1, m5 + paddd m2, m5 + psrad m1, 11 + psrad m2, 11 .pass1_end2: vpbroadcastd m3, [clip_18b_min] vpbroadcastd m4, [clip_18b_max] + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 pmaxsd m0, m3 pmaxsd m1, m3 pminsd m0, m4 pminsd m1, m4 jmp tx2q .pass2: - mova [cq+16*0], m0 - vextracti128 [cq+16*3], m1, 1 - mova m2, m1 - vpermq m5, m0, q1010 - call m(iadst_4x4_internal_10bpc).main2 + call .main_pass2 + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass2_end: + vpbroadcastd m5, [pd_2048] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 .end: +%if WIN64 + WIN64_RESTORE_XMM_INTERNAL + %assign xmm_regs_used 6 +%endif +.end2: vpbroadcastd m4, [pw_16384] movq xm2, [dstq+strideq*0] movq xm3, [dstq+strideq*1] @@ -627,53 +663,53 @@ cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 movhps [r6 +strideq*0], xm0 movhps [r6 +strideq*1], xm1 RET +.main_pass2: + vextracti128 xm3, m1, 1 + mova xm2, xm1 + vextracti128 xm1, m0, 1 + jmp m(iadst_4x4_internal_10bpc).main2 -INV_TXFM_4X4_12BPC_FN flipadst, dct -INV_TXFM_4X4_12BPC_FN flipadst, adst -INV_TXFM_4X4_12BPC_FN flipadst, flipadst -INV_TXFM_4X4_12BPC_FN flipadst, identity +INV_TXFM_4X4_FN flipadst, dct, 12 +INV_TXFM_4X4_FN flipadst, adst, 12 +INV_TXFM_4X4_FN flipadst, flipadst, 12 +INV_TXFM_4X4_FN flipadst, identity, 12 -cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - psrld m4, 8 - vpermd m2, m4, m0 - vpermd m1, m4, m1 - punpckhqdq m0, m1, m2 - punpcklqdq m1, m2 - jmp m(iadst_4x4_internal_12bpc).pass1_end2 + vinserti128 m1, m3, xm2, 1 + vinserti128 m2, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass1_end .pass2: - mova [cq+16*0], m0 - vextracti128 [cq+16*3], m1, 1 - mova m2, m1 - vpermq m5, m0, q1010 - call m(iadst_4x4_internal_10bpc).main2 - vpermq m2, m0, q1032 - vpermq m0, m1, q1032 - mova m1, m2 - jmp m(iadst_4x4_internal_12bpc).end - -INV_TXFM_4X4_12BPC_FN identity, dct -INV_TXFM_4X4_12BPC_FN identity, adst -INV_TXFM_4X4_12BPC_FN identity, flipadst -INV_TXFM_4X4_12BPC_FN identity, identity - -cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 - vpbroadcastd m1, [pd_5793] - pmulld m0, m1, [cq+32*0] - pmulld m1, [cq+32*1] + call m(iadst_4x4_internal_12bpc).main_pass2 + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp 
m(iadst_4x4_internal_12bpc).pass2_end + +INV_TXFM_4X4_FN identity, dct, 12 +INV_TXFM_4X4_FN identity, adst, 12 +INV_TXFM_4X4_FN identity, flipadst, 12 +INV_TXFM_4X4_FN identity, identity, 12 + +cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + mova m2, [itx4_shuf] + vpbroadcastd m3, [pd_1697] + vpermd m0, m2, [cq+32*0] + vpermd m2, m2, [cq+32*1] vpbroadcastd m5, [pd_2048] - mova m3, [itx4_shuf] - paddd m0, m5 + pmulld m1, m3, m0 + pmulld m3, m2 paddd m1, m5 - psrad m0, 12 + paddd m3, m5 psrad m1, 12 - vpermd m2, m3, m0 - vpermd m1, m3, m1 - jmp m(iadst_4x4_internal_12bpc).pass1_end + psrad m3, 12 + paddd m1, m0 + paddd m2, m3 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: ; m0 = in0 in1 ; m1 = in2 in3 vpbroadcastd m3, [pd_5793] + vpbroadcastd m5, [pd_2048] pmulld m0, m3 pmulld m1, m3 paddd m0, m5 ; 2048 @@ -685,34 +721,19 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 8 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 2048 - sar r6d, 12 -.end: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 - movd xm0, r6d - vpbroadcastw xm0, xm0 - vpbroadcastd xm3, [pixel_%3bpc_max] - pxor xm2, xm2 -.end_loop: - movq xm1, [dstq+strideq*0] - movhps xm1, [dstq+strideq*1] - paddw xm1, xm0 - pmaxsw xm1, xm2 - pminsw xm1, xm3 - movq [dstq+strideq*0], xm1 - movhps [dstq+strideq*1], xm1 - lea dstq, [dstq+strideq*2] - sub r3d, 2 - jg .end_loop - WRAP_XMM RET + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 +%else + jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly +%endif %endif %endmacro @@ -797,12 +818,14 @@ INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 -.pass1: call m(iadst_8x4_internal_10bpc).main - psrad m0, m4, 12 - psrad m1, m5, 12 - psrad m2, 12 - psrad m3, 12 + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass1_end: + REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: call .pass2_main @@ -918,13 +941,13 @@ INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 -.pass1: call m(iadst_8x4_internal_10bpc).main - psrad m0, m3, 12 - psrad m1, m2, 12 - psrad m2, m5, 12 - psrad m3, m4, 12 - jmp tx2q + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m2, m5, m6 + paddd m3, m5, m4 + jmp m(iadst_4x8_internal_10bpc).pass1_end .pass2: call m(iadst_4x8_internal_10bpc).pass2_main mova xm4, [pw_2048_m2048] @@ -1070,7 +1093,16 @@ INV_TXFM_4X8_FN adst, flipadst, 12 INV_TXFM_4X8_FN adst, identity, 12 cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 - jmp m(iadst_4x8_internal_10bpc).pass1 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m4, 1 + psrad m1, m6, 1 + psrad m2, 1 + psrad m3, 1 +.pass1_end: + vpbroadcastd m5, [pd_1024] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 11}, m0, m1, m2, m3 + jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] @@ -1146,7 +1178,12 @@ INV_TXFM_4X8_FN flipadst, flipadst, 12 INV_TXFM_4X8_FN flipadst, identity, 12 cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 - jmp m(iflipadst_4x8_internal_10bpc).pass1 + call 
m(iadst_8x4_internal_10bpc).main + psrad m0, m3, 1 + psrad m1, m2, 1 + psrad m2, m6, 1 + psrad m3, m4, 1 + jmp m(iadst_4x8_internal_12bpc).pass1_end .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] @@ -1180,12 +1217,13 @@ cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x16, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd xm2, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 6144 - sar r6d, 13 - jmp m(inv_txfm_add_dct_dct_4x8_%3bpc).end + or r3d, 16 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 %endif %endmacro @@ -1196,7 +1234,7 @@ INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 .pass1: - vpbroadcastd m10, [pd_6144] + vpbroadcastd m10, [pd_3072] mova m1, [cq+32*2] mova m3, [cq+32*6] mova m5, [cq+32*3] @@ -1241,7 +1279,7 @@ ALIGN function_align vpbroadcastd m4, [pd_3784] vpbroadcastd m8, [pd_1567] vpbroadcastd m9, [pd_2048] - vpbroadcastd m6, [pd_2896] + vpbroadcastd m6, [pd_1448] ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h ret @@ -1253,7 +1291,7 @@ ALIGN function_align psubd m0, m2 paddd m9, m4, m6 psubd m4, m6 - REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h psubd m2, m0, m1 paddd m1, m0 psubd m6, m4, m5 @@ -1304,7 +1342,6 @@ INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 -.pass1: call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_6144] call m(iadst_16x4_internal_10bpc).main_end @@ -1545,7 +1582,6 @@ INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 -.pass1: vpbroadcastd m7, [pd_5793] pmulld m0, m7, [cq+32*0] pmulld m4, m7, [cq+32*1] @@ -1678,7 +1714,16 @@ INV_TXFM_4X16_FN adst, flipadst, 12 INV_TXFM_4X16_FN adst, identity, 12 cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iadst_4x16_internal_10bpc).pass1 + call .main_pass1 + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + psrad m6, 12 + psrad m7, 12 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -1740,6 +1785,22 @@ ALIGN function_align vperm2i128 m4, m8, m9, 0x20 ; 8 10 vperm2i128 m6, m8, m9, 0x31 ; 12 14 ret +ALIGN function_align +.main_pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_3072] + paddd m10, m4, m5 + psubd m4, m3 + psubd m5, m3 + paddd m3, m10 + psubd m8, m7, m1 + paddd m7, m9 + psubd m9, m1 + paddd m7, m1 + REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 + REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 + paddd m6, m0 + ret INV_TXFM_4X16_FN flipadst, dct, 12 INV_TXFM_4X16_FN flipadst, adst, 12 @@ -1747,7 +1808,16 @@ INV_TXFM_4X16_FN flipadst, flipadst, 12 INV_TXFM_4X16_FN flipadst, identity, 12 cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iflipadst_4x16_internal_10bpc).pass1 + call m(iadst_4x16_internal_12bpc).main_pass1 + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + psrad m4, m7, 12 + psrad m5, m6, 12 + psrad m6, m9, 12 + psrad m7, m8, 12 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -1772,17 +1842,49 @@ 
INV_TXFM_4X16_FN identity, flipadst, 12 INV_TXFM_4X16_FN identity, identity, 12 cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iidentity_4x16_internal_10bpc).pass1 + vpbroadcastd m8, [pd_1697] + mova m0, [cq+32*0] + mova m4, [cq+32*1] + mova m1, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m9, [pd_6144] + pmulld m2, m8, m0 + pmulld m6, m8, m4 + pmulld m3, m8, m1 + pmulld m7, m8, m5 + mova m10, [cq+32*4] + mova m11, [cq+32*5] + mova m12, [cq+32*6] + mova m13, [cq+32*7] + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m0, m2 + pmulld m2, m8, m10 + paddd m4, m6 + pmulld m6, m8, m11 + paddd m1, m3 + pmulld m3, m8, m12 + paddd m5, m7 + pmulld m7, m8, m13 + REPX {psrad x, 1 }, m0, m4, m1, m5 + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m2, m10 + paddd m6, m11 + paddd m3, m12 + paddd m7, m13 + REPX {psrad x, 1 }, m2, m6, m3, m7 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 - vpbroadcastd m8, [pd_11586] - vpbroadcastd m9, [pd_2048] + vpbroadcastd m8, [pd_5793] + vpbroadcastd m9, [pd_1024] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 @@ -1795,37 +1897,21 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x4, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + vpbroadcastd m2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 mov [cq], eobd ; 0 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 - movd xm0, r6d - vpbroadcastw m0, xm0 -.end: - vpbroadcastd m4, [pixel_%3bpc_max] - pxor m3, m3 - mova xm1, [dstq+strideq*0] - vinserti128 m1, [dstq+strideq*1], 1 - lea r6, [dstq+strideq*2] - mova xm2, [r6 +strideq*0] - vinserti128 m2, [r6 +strideq*1], 1 - paddw m1, m0 - paddw m2, m0 - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 - mova [dstq+strideq*0], xm1 - vextracti128 [dstq+strideq*1], m1, 1 - mova [r6 +strideq*0], xm2 - vextracti128 [r6 +strideq*1], m2, 1 - RET + or r3d, 4 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +%else + jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly +%endif %endif %endmacro @@ -1960,32 +2046,7 @@ ALIGN function_align REPX {paddd x, m4}, m0, m3, m2, m1 REPX {psrad x, 12}, m0, m3, m2, m1 .main2: - vbroadcasti128 m6, [pd_1321] - vbroadcasti128 m7, [pd_2482] - pmulld m4, m0, m6 ; 1321*in0 - pmulld m5, m3, m7 ; 2482*in3 - paddd m4, m5 ; 1321*in0 + 2482*in3 - pmulld m5, m0, m7 ; 2482*in0 - paddd m0, m3 ; in0 + in3 - paddd m7, m6 ; pd_3803 - pmulld m6, m2 ; 1321*in2 - pmulld m3, m7 ; 3803*in3 - pmulld m7, m2 ; 3803*in2 - psubd m2, m0 ; in2 - in0 - in3 - vpbroadcastd m0, [pd_m3344] - psubd m5, m6 ; 2482*in0 - 1321*in2 - vpbroadcastd m6, [pd_2048] - psubd m5, m3 ; t1 - pmulld m2, m0 ; t2 - pmulld m1, m0 ; -t3 - paddd m4, m7 ; t0 - paddd m5, m6 - paddd m3, m4, m5 - paddd m4, m6 - psubd m4, m1 ; out0 (unshifted) - psubd m5, m1 ; out1 (unshifted) - paddd m2, m6 ; out2 (unshifted) - paddd m3, m1 ; out3 (unshifted) + IADST4_1D ret INV_TXFM_8X4_FN flipadst, dct @@ 
-2103,10 +2164,13 @@ cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call .pass2_main - psrad m0, m4, 12 - psrad m1, m5, 12 - psrad m2, 12 - psrad m3, 12 + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass2_end: + REPX {psrad x, 12}, m0, m1, m2, m3 .end: vpbroadcastd m4, [pw_16384] REPX {psrad x, 3}, m0, m1, m2, m3 @@ -2162,11 +2226,12 @@ cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).pass2_main - psrad m0, m3, 12 - psrad m3, m4, 12 - psrad m1, m2, 12 - psrad m2, m5, 12 - jmp m(iadst_8x4_internal_12bpc).end + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m3, m5, m4 + paddd m2, m5, m6 + jmp m(iadst_8x4_internal_12bpc).pass2_end INV_TXFM_8X4_FN identity, dct, 12 INV_TXFM_8X4_FN identity, adst, 12 @@ -2197,32 +2262,36 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - mov [cq], eobd ; 0 - mov r3d, 8 + vpbroadcastd m2, [dconly_%3bpc] +%if %3 = 10 .dconly: - add r6d, 6144 - sar r6d, 13 + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm2 vpbroadcastw m0, xm0 - vpbroadcastd m3, [pixel_%3bpc_max] - pxor m2, m2 .dconly_loop: mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 - paddw m1, m0 - pmaxsw m1, m2 - pminsw m1, m3 + paddsw m1, m0 + psubusw m1, m2 mova [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET +%else + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly +%endif %endif %endmacro @@ -2245,7 +2314,7 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a psubd m%10, m%7, m%9 ; t7 paddd m%7, m%9 ; out6 - vpbroadcastd m%9, [pd_2896] + vpbroadcastd m%9, [pd_1448] psubd m%4, m%8, m%6 ; t3 paddd m%8, m%6 ; -out7 psubd m%6, m%1, m%3 ; t2 @@ -2255,10 +2324,10 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 - psubd m%5, m%6, m%4 ; (t2 - t3) * 2896 - paddd m%4, m%6 ; (t2 + t3) * 2896 - psubd m%6, m%3, m%10 ; (t6 - t7) * 2896 - paddd m%3, m%10 ; (t6 + t7) * 2896 + psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 + paddd m%4, m%6 ; (t2 + t3) * 1448 + psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 + paddd m%3, m%10 ; (t6 + t7) * 1448 %endmacro INV_TXFM_8X8_FN dct, dct @@ -2430,8 +2499,8 @@ ALIGN function_align vpbroadcastd m11, [pd_2048] .main2: IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 - psrld m8, 11 ; pd_1 - vpbroadcastd m9, [pd_6144] + psrld m8, 10 ; pd_1 + vpbroadcastd m9, [pd_3072] ret ALIGN function_align .main_end: @@ -2440,14 +2509,14 @@ ALIGN function_align paddd m6, m8 psubd m7, m8, m7 REPX {psrad x, 1 }, m0, m1, m6, m7 - ; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13 - ; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13 - psubd m8, m9, m8 ; pd_6143 + ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 + ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 + 
psubd m8, m9, m8 ; pd_3071 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 - REPX {psrad x, 13}, m2, m3, m4, m5 + REPX {psrad x, 12}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct @@ -2496,10 +2565,10 @@ ALIGN function_align paddd m5, m9, m2 psubd m2, m8, m3 paddd m3, m9, m4 - psrad m4, m2, 13 - psrad m2, m10, 13 - psrad m3, 13 - psrad m5, 13 + psrad m4, m2, 12 + psrad m2, m10, 12 + psrad m3, 12 + psrad m5, 12 ret INV_TXFM_8X8_FN identity, dct @@ -2681,13 +2750,13 @@ ALIGN function_align paddd m6, m9 psubd m7, m9, m7 REPX {psrad x, 4}, m0, m1, m6, m7 - vpbroadcastd m9, [pd_34816] - psubd m8, m9, m8 ; 34815 + vpbroadcastd m9, [pd_17408] + psubd m8, m9, m8 ; 17407 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 - REPX {psrad x, 16}, m2, m3, m4, m5 + REPX {psrad x, 15}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct, 12 @@ -2729,13 +2798,14 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 %endif %endmacro @@ -2904,7 +2974,7 @@ ALIGN function_align vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 - ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2 psubd m3, m1, m4 ; t10 paddd m1, m4 ; t9 psubd m4, m0, m2 ; t11a @@ -3269,7 +3339,7 @@ cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_end: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 jmp m(idct_8x16_internal_12bpc).end ALIGN function_align .pass2_main: @@ -3302,9 +3372,9 @@ ALIGN function_align pmaxsd m7, m13, [cq+32* 3] ; 3 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_16x8_internal_10bpc).main_part2 - vpbroadcastd m14, [pd_34816] + vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_34815 + psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret @@ -3357,49 +3427,52 @@ ALIGN function_align m8, m9, m10, m11, m12, m13, m14 pminsd m15, [cq] mova [cq], m7 - vpbroadcastd m7, [pd_11586] + vpbroadcastd m7, [pd_5793] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 pmulld m7, [cq] mova [cq], m15 - vpbroadcastd m15, [pd_2048] + vpbroadcastd m15, [pd_1024] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [cq] - REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ret %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x4, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - mov [cq], eobd ; 0 - mov r3d, 4 + vpbroadcastd m3, [dconly_%3bpc] +%if %3 = 10 .dconly: - add r6d, 6144 - sar r6d, 13 + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - 
vpbroadcastd m4, [pixel_%3bpc_max] - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+strideq*0] - paddw m2, m0, [dstq+strideq*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+strideq*0] + paddsw m2, m0, [dstq+strideq*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET +%else + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly +%endif %endif %endmacro @@ -3480,13 +3553,30 @@ ALIGN function_align .pass1_main2: ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 - psubd m4, m10, m5 ; t9 -t10 + vbroadcasti128 m12, [pd_3784_m3784] + psubd m4, m10, m5 paddd m10, m5 ; t8 t11 - psubd m5, m11, m6 ; t14 -t13 + psignd m4, m12 ; t9 t10 + psubd m5, m11, m6 paddd m11, m6 ; t15 t12 - REPX {pmaxsd x, m8}, m4, m5, m10, m11 - REPX {pminsd x, m9}, m4, m5, m10, m11 - ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2 + psignd m5, m12 ; t14 t13 + vpbroadcastd m6, [pd_1567] + vpbroadcastd m13, [pd_3784] + REPX {pmaxsd x, m8}, m5, m4 + REPX {pminsd x, m9}, m5, m4 + pmulld m12, m5 + pmulld m5, m6 + vbroadcasti128 m6, [pd_1567_m1567] + pmulld m13, m4 + pmulld m4, m6 + REPX {pmaxsd x, m8}, m10, m11, m0, m1 + REPX {pminsd x, m9}, m10, m11, m0, m1 + paddd m12, m7 + paddd m5, m7 + paddd m4, m12 + psubd m5, m13 + psrad m4, 12 ; t14a t10a + psrad m5, 12 ; t9a t13a vpbroadcastd m12, [pd_2896] punpckhqdq m6, m11, m5 punpcklqdq m11, m4 @@ -3500,8 +3590,8 @@ ALIGN function_align REPX {pminsd x, m9}, m5, m6 pmulld m5, m12 pmulld m6, m12 - REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10 - REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10 + REPX {pmaxsd x, m8}, m2, m3, m11, m10 + REPX {pminsd x, m9}, m2, m3, m11, m10 ret ALIGN function_align .pass1_main3: @@ -3565,10 +3655,10 @@ cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(idct_16x4_internal_10bpc).end ALIGN function_align .main: - vbroadcasti128 m6, [pd_1321] + vpbroadcastd m6, [pd_1321] mova m0, [cq+32*0] mova m1, [cq+32*1] - vbroadcasti128 m7, [pd_2482] + vpbroadcastd m7, [pd_2482] mova m2, [cq+32*6] mova m3, [cq+32*7] pmulld m4, m0, m6 @@ -3663,8 +3753,7 @@ INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 -.pass1: - vpbroadcastd m8, [pd_11586] + vpbroadcastd m8, [pd_5793] vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m1, [cq+32*1], q3120 ; 2 3 vpermq m2, [cq+32*2], q3120 ; 4 5 @@ -3673,10 +3762,10 @@ cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpermq m5, [cq+32*5], q3120 ; a b vpermq m6, [cq+32*6], q3120 ; c d vpermq m7, [cq+32*7], q3120 ; e f - vpbroadcastd m9, [pd_6144] + vpbroadcastd m9, [pd_3072] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed @@ -3729,17 +3818,15 @@ cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 pmulld m2, m6, m11 pmulld m4, m6, m12 pmulld m6, m13 - vpbroadcastd m10, [pd_2048] + vpbroadcastd m10, [pd_17408] call m(idct_4x16_internal_10bpc).pass1_main2 - REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 - vpbroadcastd m4, [pw_16384] vpbroadcastd m5, 
[pixel_12bpc_max] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 - REPX {pmulhrsw x, m4}, m0, m1, m2, m3 jmp m(idct_16x4_internal_10bpc).end2 INV_TXFM_16X4_FN adst, dct, 12 @@ -3824,7 +3911,37 @@ INV_TXFM_16X4_FN identity, flipadst, 12 INV_TXFM_16X4_FN identity, identity, 12 cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iidentity_16x4_internal_10bpc).pass1 + vpbroadcastd m8, [pd_1697] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpbroadcastd m9, [pd_3072] + pmulld m4, m8, m0 + pmulld m5, m8, m1 + pmulld m6, m8, m2 + pmulld m7, m8, m3 + vpermq m10, [cq+32*4], q3120 ; 8 9 + vpermq m11, [cq+32*5], q3120 ; a b + vpermq m12, [cq+32*6], q3120 ; c d + vpermq m13, [cq+32*7], q3120 ; e f + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m0, m4 + pmulld m4, m8, m10 + paddd m1, m5 + pmulld m5, m8, m11 + paddd m2, m6 + pmulld m6, m8, m12 + paddd m3, m7 + pmulld m7, m8, m13 + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m4, m10 + paddd m5, m11 + paddd m6, m12 + paddd m7, m13 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -3844,13 +3961,14 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 8 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_%3bpc).dconly + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 %endif %endmacro @@ -4013,13 +4131,13 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 .pass1: lea r6, [rsp+32*4] call .main - vpbroadcastd m14, [pd_6144] + vpbroadcastd m14, [pd_3072] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_6143 + psubd m13, m14, m15 ; pd_3071 call .pass1_rotations .pass1_end: REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 jmp tx2q .pass2: call m(idct_16x8_internal_10bpc).transpose @@ -4127,8 +4245,6 @@ ALIGN function_align pmaxsd m10, m13 pminsd m9, m14 pminsd m10, m14 - pmulld m9, m15 - pmulld m10, m15 mova [r6-32*4], m1 mova m11, [r6-32*1] ; t7a mova m1, [r6-32*2] ; t6a @@ -4140,7 +4256,6 @@ ALIGN function_align pmaxsd m2, m13 pminsd m8, m14 pminsd m2, m14 - pmulld m8, m15 mova [r6-32*1], m11 mova [r6-32*3], m2 mova m1, [r6+32*3] ; t15 @@ -4153,8 +4268,6 @@ ALIGN function_align pmaxsd m11, m13 pminsd m7, m14 pminsd m11, m14 - pmulld m7, m15 - pmulld m11, m15 mova [r6-32*2], m12 pminsd m1, m14, [r6+32*0] ; t10a pminsd m12, m14, [r6+32*1] ; t11a @@ -4162,13 +4275,13 @@ ALIGN function_align paddd m1, m4 ; -out1 psubd m4, m5, m12 ; t11 paddd m5, m12 ; out14 - pmulld m12, m15, [r6-32*3] ; t6 + vpbroadcastd m12, [pd_1448] pmaxsd m6, m13 pmaxsd m4, m13 pminsd m6, m14 pminsd m4, m14 - pmulld m6, m15 - pmulld m4, m15 + REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 + pmulld m12, [r6-32*3] ; t6 mova [r6-32*3], m5 paddd m5, m11, m7 ; -out5 (unshifted) psubd m11, m7 ; out10 (unshifted) @@ -4233,7 +4346,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 .pass1: lea r6, [rsp+32*4] call m(iadst_16x8_internal_10bpc).main - vpbroadcastd m14, 
[pd_6144] + vpbroadcastd m14, [pd_3072] psrld m15, 11 psubd m13, m14, m15 call .pass1_rotations @@ -4313,16 +4426,16 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 mova [rsp], m15 - vpbroadcastd m15, [pd_11586] + vpbroadcastd m15, [pd_5793] REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 pmulld m15, [rsp] mova [rsp], m7 - vpbroadcastd m7, [pd_6144] + vpbroadcastd m7, [pd_3072] REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [rsp] - REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: @@ -4340,6 +4453,10 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_20b_max] jmp m(idct_16x8_internal_10bpc).pass1 .pass2: + call .pass2_main + RET +ALIGN function_align +.pass2_main: call m(idct_8x16_internal_12bpc).transpose vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -4383,8 +4500,7 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpermq m1, m5, q3120 vpermq m2, m6, q3120 vpermq m3, m7, q3120 - call m(idct_16x8_internal_10bpc).write_16x4_zero - RET + jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .write_16x4_start: vpbroadcastd m9, [pixel_12bpc_max] @@ -4403,7 +4519,8 @@ cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iadst_16x8_internal_10bpc).pass1 .pass2: call .pass2_main - jmp m(idct_16x8_internal_12bpc).end + call m(idct_16x8_internal_12bpc).end + RET ALIGN function_align .pass2_main: call m(idct_8x16_internal_12bpc).transpose @@ -4483,12 +4600,13 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 %macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 16x16, %4 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 %endif %endmacro @@ -4756,17 +4874,17 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 add cq, 32 call .main sub cq, 32 - vpbroadcastd m8, [pd_10240] + vpbroadcastd m8, [pd_5120] paddd m4, m8 paddd m6, m8 paddd m9, m8 paddd m11, m8 - vpbroadcastd m8, [pd_10239] + vpbroadcastd m8, [pd_5119] psubd m5, m8, m5 psubd m7, m8, m7 psubd m10, m8, m10 psubd m12, m8, m12 - REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12 + REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 @@ -4797,8 +4915,8 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 .fast: add r6, 32*8 call .main - vpbroadcastd m14, [pd_10240] - vpbroadcastd m13, [pd_10239] + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 paddd m0, m15 psubd m1, m15, m1 @@ -4818,7 +4936,7 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 psubd m15, [r6-32*4] .pass1_end: REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 sub r6, 32*8 jmp tx2q .pass2: @@ -4892,17 
+5010,17 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx add cq, 32 call m(iadst_16x16_internal_10bpc).main sub cq, 32 - vpbroadcastd m8, [pd_10240] + vpbroadcastd m8, [pd_5120] paddd m11, m8 paddd m9, m8 paddd m6, m8 paddd m4, m8 - vpbroadcastd m8, [pd_10239] + vpbroadcastd m8, [pd_5119] psubd m12, m8, m12 psubd m10, m8, m10 psubd m7, m8, m7 psubd m5, m8, m5 - REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4 + REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 mova [r6+32*0], m12 mova [r6+32*1], m11 mova [r6+32*2], m10 @@ -4933,8 +5051,8 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx .fast: add r6, 32*8 call m(iadst_16x16_internal_10bpc).main - vpbroadcastd m14, [pd_10240] - vpbroadcastd m13, [pd_10239] + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 psubd m8, m13, m7 paddd m7, m14, m9 @@ -4996,9 +5114,8 @@ INV_TXFM_16X16_FN identity, dct, -92 INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 -.pass1: - vpbroadcastd m15, [pd_11586] - vpbroadcastd m7, [pd_10240] + vpbroadcastd m15, [pd_5793] + vpbroadcastd m7, [pd_5120] lea r6, [rsp+32*4] sub eobd, 36 jl .fast @@ -5010,7 +5127,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx pmulld m3, m15, [cq+r3+32*39] add r6, 32*4 REPX {paddd x, m7}, m0, m1, m2, m3 - REPX {psrad x, 14}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 @@ -5038,7 +5155,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [cq] - REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: @@ -5203,7 +5320,7 @@ cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_part3: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 .end: packssdw m15, m14 packssdw m14, m13, m12 @@ -5320,15 +5437,15 @@ ALIGN function_align REPX {pminsd x, m14}, m1, m3, m4, m6 .pass2_fast2: call m(iadst_16x8_internal_10bpc).main_part2 - vpbroadcastd m14, [pd_34816] + vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_34815 + psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret ALIGN function_align .pass2_part2: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 @@ -5375,8 +5492,73 @@ cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx INV_TXFM_16X16_FN identity, dct, -92, 12 INV_TXFM_16X16_FN identity, identity, 0, 12 +%macro IDTX16_12BPC 1 ; src + pmulld m6, m7, m%1 + paddd m6, m15 + psrad m6, 12 + paddd m6, m%1 + psrad m%1, m6, 1 +%endmacro + cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 - jmp m(iidentity_16x16_internal_10bpc).pass1 + vpbroadcastd m7, [pd_1697] + vpbroadcastd m15, [pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + mova m10, [cq+r3+32*33] + mova m11, [cq+r3+32*35] + mova m12, [cq+r3+32*37] + 
mova m13, [cq+r3+32*39] + add r6, 32*4 + pmulld m0, m7, m10 + pmulld m1, m7, m11 + pmulld m2, m7, m12 + pmulld m3, m7, m13 + REPX {paddd x, m15}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + paddd m0, m10 + paddd m1, m11 + paddd m2, m12 + paddd m3, m13 + REPX {psrad x, 1 }, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m8, [cq+64* 6] + mova m9, [cq+64* 7] + REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 + mova [cq+64*0], m8 + mova [cq+64*1], m9 + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 + mova m6, [cq+64*15] + pmulld m7, m6 + paddd m7, m15 + psrad m7, 12 + paddd m7, m6 + mova m6, [cq+64*0] + psrad m15, m7, 1 + mova m7, [cq+64*1] + jmp tx2q .pass2: call m(iidentity_8x16_internal_12bpc).pass2_main call m(idct_16x16_internal_10bpc).transpose_fast @@ -5429,7 +5611,7 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx call m(idct_16x16_internal_12bpc).write_16x16 RET -%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift +%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack mova m%4, [r6+32*(%1-4)] mova m%2, [r5+32*(3-%1)] mova m%5, [r4+32*(%1-4)] @@ -5446,8 +5628,10 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx paddd m%2, m%3, m%5 ; out15 - n psubd m%3, m%5 ; out16 + n REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%if %7 & 1 packssdw m%1, m%3 ; out0 + n, out16 + n packssdw m%2, m%4 ; out15 - n, out31 - n +%endif %endmacro cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob @@ -5574,14 +5758,15 @@ cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 ALIGN function_align -.pass1_main: +.pass1_main_part1: mova m0, [cq+128*0] mova m1, [cq+128*1] mova m2, [cq+128*2] @@ -5590,7 +5775,6 @@ ALIGN function_align mova m5, [cq+128*5] mova m6, [cq+128*6] mova m7, [cq+128*7] - add cq, 32 call m(idct_8x8_internal_10bpc).main psrld m1, m11, 10 ; pd_2 REPX {paddd x, m1}, m0, m6, m5, m3 @@ -5603,6 +5787,11 @@ ALIGN function_align psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.pass1_main: + call .pass1_main_part1 + add cq, 32 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 @@ -5665,7 +5854,7 @@ ALIGN function_align vpbroadcastd m15, [pd_4017] vpbroadcastd m10, [pd_799] ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a - ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m1 ; t28a @@ -5734,7 +5923,7 @@ ALIGN function_align vpbroadcastd m15, [pd_2276] vpbroadcastd m10, [pd_3406] ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a - ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a psubd m3, m0, m6 ; t27a paddd m0, m6 
; t24a psubd m6, m7, m1 ; t20a @@ -5747,8 +5936,8 @@ ALIGN function_align REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] - ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a - ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 mova m9, [r6-32*4] ; t16a mova m10, [r6-32*3] ; t17 psubd m2, m9, m7 ; t23 @@ -5881,8 +6070,9 @@ ALIGN function_align ret cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob - vpbroadcastd m5, [pw_5] vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_5] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -5947,30 +6137,262 @@ ALIGN function_align vextracti128 [dstq+r4 ], m3, 1 ret +cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + mov r4, cq + lea r6, [rsp+32*4] + call .pass1_main + cmp eobd, 43 + jge .eob43 + jmp .pass2_fast +.eob43: + call .pass1_main + cmp eobd, 107 + jge .eob107 +.pass2_fast: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(idct_8x16_internal_10bpc).main_oddhalf_fast + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + jmp .pass2_end +.eob107: + call .pass1_main + cmp eobd, 171 + jge .eob171 + jmp .pass2 +.eob171: + call .pass1_main +.pass2: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + pmaxsd m4, m12, [cq+128*1+64] + pmaxsd m5, m12, [cq+128*7+64] + pmaxsd m6, m12, [cq+128*1+96] + pmaxsd m7, m12, [cq+128*7+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + pmaxsd m4, m12, [cq+128*3+64] + pmaxsd m5, m12, [cq+128*5+64] + pmaxsd m6, m12, [cq+128*3+96] + pmaxsd m7, m12, [cq+128*5+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + pmaxsd m4, m12, [cq+128*2+64] + pmaxsd m5, m12, [cq+128*6+64] + pmaxsd m6, 
m12, [cq+128*2+96] + pmaxsd m7, m12, [cq+128*6+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + pmaxsd m4, m12, [cq+128*0+64] + pmaxsd m5, m12, [cq+128*4+64] + pmaxsd m6, m12, [cq+128*0+96] + pmaxsd m7, m12, [cq+128*4+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf +.pass2_end: + psrld m11, 8 ; pd_8 + IDCT32_END 0, 15, 8, 9, 10, 4 + IDCT32_END 1, 14, 8, 9, 10, 4 + punpckhqdq m8, m0, m1 ; 16 17 (interleaved) + punpcklqdq m0, m1 ; 0 1 (interleaved) + punpcklqdq m1, m14, m15 ; 14 15 (interleaved) + punpckhqdq m14, m15 ; 30 31 (interleaved) + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 4 + IDCT32_END 3, 14, 8, 9, 10, 4 + punpckhqdq m8, m2, m3 ; 18 19 (interleaved) + punpcklqdq m2, m3 ; 2 3 (interleaved) + punpcklqdq m3, m14, m15 ; 12 13 (interleaved) + punpckhqdq m14, m15 ; 28 29 (interleaved) + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 4 + IDCT32_END 5, 14, 8, 9, 10, 4 + punpckhqdq m8, m4, m5 ; 20 21 (interleaved) + punpcklqdq m4, m5 ; 4 5 (interleaved) + punpcklqdq m5, m14, m15 ; 10 11 (interleaved) + punpckhqdq m14, m15 ; 26 27 (interleaved) + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 4 + IDCT32_END 7, 14, 8, 9, 10, 4 + punpckhqdq m8, m6, m7 ; 22 23 (interleaved) + punpcklqdq m6, m7 ; 6 7 (interleaved) + punpcklqdq m7, m14, m15 ; 8 9 (interleaved) + punpckhqdq m14, m15 ; 24 25 (interleaved) + mova [r5-32*3], m8 + mova [r5-32*4], m14 + mova m15, m1 +.end: + vpermq m0, m0, q3120 + vpermq m1, m2, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m6, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m7, q3120 + vpermq m1, m5, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m3, q3120 + vpermq m1, m15, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*3], q3120 + vpermq m1, [r5+32*1], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*1], q3120 + vpermq m1, [r5-32*3], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*4], q3120 + vpermq m1, [r5-32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*0], q3120 + vpermq m1, [r5+32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +ALIGN function_align +.pass1_main: + call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 32 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2, 0 + mova [cq+32*16], m8 + mova [cq+32*31], m9 + IDCT32_END 1, 14, 8, 9, 10, 2, 0 + mova [cq+32*17], m8 + mova [cq+32*30], m9 + mova [cq+32*14], m14 + IDCT32_END 2, 14, 8, 9, 10, 2, 0 + mova [cq+32*18], m8 + mova [cq+32*29], m9 + mova [cq+32*13], m14 + IDCT32_END 3, 14, 8, 9, 10, 2, 0 + mova [cq+32*19], m8 + mova [cq+32*28], m9 + mova [cq+32*12], m14 + 
IDCT32_END 4, 14, 8, 9, 10, 2, 0 + mova [cq+32*20], m8 + mova [cq+32*27], m9 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + IDCT32_END 5, 10, 0, 1, 2, 2, 0 + mova [cq+32*21], m0 + mova [cq+32*26], m1 + IDCT32_END 6, 9, 0, 1, 2, 2, 0 + mova [cq+32*22], m0 + mova [cq+32*25], m1 + IDCT32_END 7, 8, 0, 1, 2, 2, 0 + mova [cq+32*23], m0 + mova [cq+32*24], m1 + mova m0, [cq+32* 0] + mova m1, [cq+32* 1] + mova m2, [cq+32* 2] + mova m11, m14 + mova m12, [cq+32*12] + mova m13, [cq+32*13] + mova m14, [cq+32*14] + ret + +cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 + cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 .dconly: - add r6d, 10240 - sar r6d, 14 + add r6d, 640 + sar r6d, 10 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - vpbroadcastd m4, [pixel_10bpc_max] - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+32*0], m1 mova [dstq+32*1], m2 add dstq, strideq @@ -5979,6 +6401,39 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob RET .full: PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + call .pass1 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end + lea r6, [deint_shuf+128] + vpbroadcastd m11, [pw_2048] + mov r4, dstq + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + lea dstq, [r4+32] + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct_16x8_internal_8bpc).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.pass1: mova m0, [cq+32* 1] mova m1, [cq+32* 7] mova m2, [cq+32* 9] @@ -5988,10 +6443,7 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob mova m6, [cq+32*25] mova m7, [cq+32*31] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_18b_min] - vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] - lea r6, [rsp+32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+32* 3] mova m1, [cq+32* 5] @@ -6021,37 +6473,12 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob mova m7, [cq+32*28] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf - call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end - lea r6, [deint_shuf+128] - vpbroadcastd m11, [pw_2048] - mov r4, dstq - call .pass2 - mova m0, [r5+32*3] ; 16 17 - mova m1, [r5+32*2] ; 30 31 - mova m2, [r5+32*1] ; 18 19 - mova m3, [r5+32*0] ; 28 29 - mova m4, [r5-32*1] ; 20 21 - mova m5, [r5-32*2] ; 26 27 - mova m6, [r5-32*3] ; 22 23 - mova m7, [r5-32*4] ; 24 25 - 
call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose - lea dstq, [r4+32] - call .pass2 - RET -ALIGN function_align -.pass2: - call m(idct_16x8_internal_8bpc).main - REPX {pmulhrsw x, m11}, m0, m1, m2, m3 - call m(idct_16x8_internal_10bpc).write_16x4_start - pmulhrsw m0, m11, m4 - pmulhrsw m1, m11, m5 - pmulhrsw m2, m11, m6 - pmulhrsw m3, m11, m7 - jmp m(idct_16x8_internal_10bpc).write_16x4_zero + ret cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob - vpbroadcastd m5, [pw_4096] vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_4096] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -6078,6 +6505,47 @@ cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob jge .loop RET +cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 8 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 + call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end + mov r4, dstq + call m(idct_16x8_internal_12bpc).pass2_main + mova m0, [cq+32* 0] ; 16 + mova m1, [cq+32* 1] ; 17 + mova m2, [cq+32* 2] ; 18 + mova m3, [cq+32* 3] ; 19 + mova m4, [cq+32* 4] ; 20 + mova m5, [cq+32* 5] ; 21 + mova m6, [cq+32* 6] ; 22 + mova m7, [cq+32* 7] ; 23 + mova m8, [cq+32* 8] ; 24 + mova m9, [cq+32* 9] ; 25 + mova m10, [cq+32*10] ; 26 + mova m11, [cq+32*11] ; 27 + mova m12, [cq+32*12] ; 28 + mova m13, [cq+32*13] ; 29 + mova m14, [cq+32*14] ; 30 + mova m15, [cq+32*15] ; 31 + lea dstq, [r4+32] + call m(idct_16x8_internal_12bpc).pass2_main + RET + +cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 + %macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] mova m%4, [%2] paddsw m%3, m%1, m%4 @@ -6121,13 +6589,14 @@ cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 jmp .fast .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 .eob44: mova [r4+16*0], xm0 mova [r4+16*1], xm3 @@ -6472,14 +6941,15 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] jmp .end .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .full: add cq, 32 @@ -6742,9 +7212,10 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .fast: lea r4, [rsp+32*71] @@ -7019,12 +7490,13 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, 
[cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 .fast: lea r4, [rsp+32*38] pxor m0, m0 @@ -7246,7 +7718,7 @@ ALIGN function_align REPX {pmaxsd x, m12}, m8, m1, m6, m2 REPX {pminsd x, m13}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a - ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a REPX {pmaxsd x, m12}, m0, m3, m7, m4 REPX {pminsd x, m13}, m0, m3, m7, m4 vpbroadcastd m10, [r5+4*10] @@ -7301,7 +7773,7 @@ ALIGN function_align REPX {pmaxsd x, m12}, m8, m1, m3, m4 REPX {pminsd x, m13}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a - ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a REPX {pmaxsd x, m12}, m0, m2, m5, m7 REPX {pminsd x, m13}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a @@ -7358,14 +7830,15 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .fast: lea r4, [rsp+32*70] @@ -7540,30 +8013,26 @@ ALIGN function_align cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .normal - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 .dconly: - add r6d, 10240 - sar r6d, 14 + add r6d, 640 + sar r6d, 10 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + vpbroadcastd m5, [dconly_10bpc] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d -%if WIN64 - movaps [rsp+8], xmm6 -%endif + paddsw xm0, xm5 vpbroadcastw m0, xm0 - vpbroadcastd m6, [pixel_10bpc_max] - pxor m5, m5 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - paddw m3, m0, [dstq+32*2] - paddw m4, m0, [dstq+32*3] - REPX {pmaxsw x, m5}, m1, m2, m3, m4 - REPX {pminsw x, m6}, m1, m2, m3, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + paddsw m3, m0, [dstq+32*2] + paddsw m4, m0, [dstq+32*3] + REPX {psubusw x, m5}, m1, m2, m3, m4 mova [dstq+32*0], m1 mova [dstq+32*1], m2 mova [dstq+32*2], m3 @@ -7571,9 +8040,6 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob add dstq, strideq dec r3d jg .dconly_loop -%if WIN64 - movaps xmm6, [rsp+8] -%endif RET .normal: PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob @@ -7814,14 +8280,14 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 .fast: pxor m0, m0 @@ -7963,9 +8429,9 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly .fast: pxor m0, m0 diff --git 
a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm new file mode 100644 index 00000000000..b05fde54dc8 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm @@ -0,0 +1,2599 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23 + db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31 + db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55 + db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63 +idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51 + db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59 + db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17 + db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25 +iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23 + db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31 + db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19 + db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27 +permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13 + db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29 + db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15 + db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31 +permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2 + db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6 + db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7 + db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3 +permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6 + db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14 + db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7 + db 8, 0, 9, 9, 5, 4, 
13, 13, 10, 2, 11, 11, 7, 6, 15, 15 +idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 + db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 + db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 + db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 +idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25 + db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57 + db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29 + db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61 +idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30 + db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62 + db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31 + db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63 + +pw_2048_m2048: times 16 dw 2048 +pw_m2048_2048: times 16 dw -2048 +pw_2048: times 16 dw 2048 + +; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++- +%macro COEF_PAIR 2-3 0 ; a, b, flags +%if %3 == 1 +pd_%1_m%2: dd %1, %1, -%2, -%2 +%define pd_%1 (pd_%1_m%2 + 4*0) +%define pd_m%2 (pd_%1_m%2 + 4*2) +%elif %3 == 2 +pd_m%1_%2: dd -%1, -%1, %2, %2 +%define pd_m%1 (pd_m%1_%2 + 4*0) +%define pd_%2 (pd_m%1_%2 + 4*2) +%else +pd_%1_%2: dd %1, %1, %2, %2 +%define pd_%1 (pd_%1_%2 + 4*0) +%define pd_%2 (pd_%1_%2 + 4*2) +%if %3 == 3 +%define pd_%2_m%2 pd_%2 +dd -%2, -%2 +%endif +%endif +%endmacro + +COEF_PAIR 201, 995 +COEF_PAIR 401, 1189, 1 +COEF_PAIR 401, 1931 +COEF_PAIR 401, 3920 +COEF_PAIR 799, 2276, 1 +COEF_PAIR 799, 3406 +COEF_PAIR 799, 4017 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 2598, 1189 +COEF_PAIR 2598, 1931, 2 +COEF_PAIR 2598, 3612 +COEF_PAIR 2751, 2106 +COEF_PAIR 2896, 1567, 3 +COEF_PAIR 2896, 3784, 3 +COEF_PAIR 3035, 3513 +COEF_PAIR 3166, 1931 +COEF_PAIR 3166, 3612 +COEF_PAIR 3166, 3920 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 4017, 2276 +COEF_PAIR 4017, 3406 +COEF_PAIR 4076, 1189 +COEF_PAIR 4076, 3612 +COEF_PAIR 4076, 3920 +COEF_PAIR 4091, 3973 + +pw_5: times 2 dw 5 +pw_4096 times 2 dw 4096 +pw_1697x16: times 2 dw 1697*16 +pw_2896x8: times 2 dw 2896*8 +pixel_10bpc_max: times 2 dw 0x03ff +dconly_10bpc: times 2 dw 0x7c00 +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +pd_1: dd 1 +pd_2: dd 2 +pd_1448: dd 1448 +pd_2048: dd 2048 +pd_3071: dd 3071 ; 1024 + 2048 - 1 +pd_3072: dd 3072 ; 1024 + 2048 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 +pd_5793: dd 5793 + +cextern int8_permA +cextern idct_8x8_internal_8bpc_avx512icl.main +cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2 +cextern idct_8x16_internal_8bpc_avx512icl.main +cextern idct_8x16_internal_8bpc_avx512icl.main2 +cextern idct_8x16_internal_8bpc_avx512icl.main_fast +cextern idct_8x16_internal_8bpc_avx512icl.main_fast2 +cextern iadst_8x16_internal_8bpc_avx512icl.main2 +cextern idct_16x8_internal_8bpc_avx512icl.main +cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2 +cextern idct_16x16_internal_8bpc_avx512icl.main +cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2 +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end +cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main + +SECTION .text + +%define o_base (pw_2048+4*128) +%define o_base_8bpc (int8_permA+64*18) +%define o(x) (r5 - o_base + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_ZMM 
avx512icl + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 1 = inv_dst1, 2 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %8 < 4096 + vpbroadcastd m%3, [o(pd_%8)] +%else + vbroadcasti32x4 m%3, [o(pd_%8)] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %7 < 4096 + vpbroadcastd m%5, [o(pd_%7)] +%else + vbroadcasti32x4 m%5, [o(pd_%7)] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 2 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif +%if %9 & 1 + psubd m%1, m%3, m%1 +%else + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size +cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_10bpc) + lea r5, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%4_internal_10bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 +.dconly: + add r6d, 384 + sar r6d, 9 +.dconly2: + vpbroadcastd ym2, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw ym1, r6d + paddsw ym1, ym2 +.dconly_loop: + mova xm0, [dstq+strideq*0] + vinserti32x4 ym0, [dstq+strideq*1], 1 + paddsw ym0, ym1 + psubusw ym0, ym2 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call .load + vpermi2q m1, m0, m2 ; 1 5 + vpermi2q m3, m6, m4 ; 7 3 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call .main + call .main_end + mova m4, [o(idct8x8p)] + packssdw m0, m2 ; 0 1 4 5 + packssdw m1, m3 ; 3 2 7 6 + vpermb m0, m4, m0 + vprolq m1, 32 + vpermb m2, m4, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + vextracti32x8 ym2, m0, 1 + vextracti32x8 ym3, m1, 1 + call m(idct_8x8_internal_8bpc).main + mova m10, [permC] + vpbroadcastd m12, [pw_2048] +.end: + vpermt2q m0, m10, m1 + vpermt2q m2, m10, m3 +.end2: + vpbroadcastd m11, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m10, m10 + pmulhrsw m8, m12, m0 + call .write_8x4_start + pmulhrsw m8, m12, m2 +.write_8x4: + lea dstq, [dstq+strideq*4] + add cq, 64*2 +.write_8x4_start: + mova xm9, [dstq+strideq*0] + vinserti32x4 ym9, [dstq+strideq*1], 1 + vinserti32x4 m9, [dstq+strideq*2], 2 + vinserti32x4 m9, [dstq+r6 ], 3 + mova [cq+64*0], m10 + mova [cq+64*1], m10 + paddw m9, m8 + pmaxsw m9, m10 + pminsw m9, m11 + mova [dstq+strideq*0], xm9 + vextracti32x4 [dstq+strideq*1], ym9, 1 + vextracti32x4 [dstq+strideq*2], m9, 2 + 
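The ITX_MULSUB_2D macro defined above is the basic Q12 fixed-point rotation used by every transform in this file: coef1/coef2 are 12-bit cosine/sine constants, rnd is normally pd_2048 so the >>12 rounds to nearest, the flag bits fold a sign flip of dst1 and/or dst2 into the sequence, and passing a non-numeric rnd defers the round/shift to the caller. A minimal C model of the flag-free case (an illustrative sketch with a hypothetical function name, not dav1d's reference code; the asm keeps the products in 32 bits, which is sufficient once inputs are clamped to the clip_18b/clip_20b range):

#include <stdint.h>

/* dst1 = (src1*coef1 - src2*coef2 + rnd) >> 12
 * dst2 = (src1*coef2 + src2*coef1 + rnd) >> 12 */
static void itx_mulsub_2d(int32_t *dst1, int32_t *dst2,
                          int32_t src1, int32_t src2,
                          int32_t coef1, int32_t coef2, int32_t rnd)
{
    int64_t t1 = (int64_t)src1 * coef1 - (int64_t)src2 * coef2;
    int64_t t2 = (int64_t)src1 * coef2 + (int64_t)src2 * coef1;
    *dst1 = (int32_t)((t1 + rnd) >> 12);
    *dst2 = (int32_t)((t2 + rnd) >> 12);
}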
vextracti32x4 [dstq+r6 ], m9, 3 + ret +ALIGN function_align +.load: + mova m0, [cq+64*0] ; 0 1 + mova m4, [cq+64*1] ; 2 3 + mova m1, [o(permB)] + mova m2, [cq+64*2] ; 4 5 + mova m6, [cq+64*3] ; 6 7 + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psrlq m5, m1, 32 + vpbroadcastd m12, [o(pd_2896)] + mova m3, m1 + vpbroadcastd m11, [o(pd_1)] + ret +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m8, [o(pd_799_4017)] + pmulld m8, m1 ; t4 t7 + vpmulld m0, [o(pd_2896)] {1to16} ; dct4 out0 out1 + REPX {paddd x, m13}, m8, m0 + REPX {psrad x, 12 }, m8, m0 + pmulld m3, m8, m12 + mova m2, m0 ; dct4 out3 out2 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m3, [o(pd_4017_3406)] + vbroadcasti32x4 m8, [o(pd_799_m2276)] + vbroadcasti32x4 m2, [o(pd_2896_3784)] + vbroadcasti32x4 m9, [o(pd_2896_1567)] + pmulld m3, m1 ; t4a t5a + pmulld m1, m8 ; t7a t6a + pmulld m2, m0 ; t0 t3 + pmulld m0, m9 ; t1 t2 + jmp .main2 +.main: + ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276 + ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784 +.main2: + REPX {paddd x, m13}, m1, m3, m0, m2 + REPX {psrad x, 12 }, m1, m3, m0, m2 + punpcklqdq m8, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m8, m1 ; t5a t6a + paddd m8, m1 ; t4 t7 + pmaxsd m3, m14 + punpckhqdq m1, m2, m0 ; t3 t2 + pminsd m3, m15 + punpcklqdq m2, m0 ; t0 t1 + pmulld m3, m12 + paddd m0, m2, m1 ; dct4 out0 out1 + psubd m2, m1 ; dct4 out3 out2 + REPX {pmaxsd x, m14}, m8, m0, m2 + REPX {pminsd x, m15}, m8, m0, m2 +.main3: + pshufd m1, m3, q1032 + paddd m3, m13 + psubd m9, m3, m1 + paddd m3, m1 + psrad m9, 12 + psrad m3, 12 + punpckhqdq m1, m8, m3 ; t7 t6 + shufpd m8, m9, 0xaa ; t4 t5 + ret +.main_end: + paddd m0, m11 + paddd m2, m11 + psubd m3, m0, m1 ; out7 out6 + paddd m0, m1 ; out0 out1 + paddd m1, m2, m8 ; out3 out2 + psubd m2, m8 ; out4 out5 + REPX {vpsravd x, m11}, m0, m2, m3, m1 + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity +INV_TXFM_8X8_FN adst, adst + +cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x8_internal_10bpc).load + vpermi2q m1, m6, m2 ; 7 5 + vpermi2q m3, m4, m0 ; 3 1 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call .main + punpckldq m1, m2, m4 ; out4 out6 + punpckhdq m2, m0 ; -out5 -out7 + punpckldq m0, m3 ; out0 out2 + punpckhdq m4, m3 ; -out1 -out3 + paddd m1, m11 + psubd m3, m11, m2 + paddd m0, m11 + psubd m4, m11, m4 +.pass1_end: + REPX {psrad x, 1}, m1, m0, m3, m4 + packssdw m0, m1 ; 0 2 4 6 + packssdw m4, m3 ; 1 3 5 7 + psrlq m1, [o(permB)], 8 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + psrlq m2, m1, 32 + vpermi2q m1, m0, m3 + vpermt2q m0, m2, m3 + jmp tx2q +.pass2: + call .main_pass2 + movu m10, [permC+2] + vbroadcasti32x8 m12, [pw_2048_m2048+16] + jmp m(idct_8x8_internal_10bpc).end +.main_pass2: + vextracti32x8 ym2, m0, 1 + vextracti32x8 ym3, m1, 1 + lea r5, [o_base_8bpc] + pshufd ym4, ym0, q1032 + pshufd ym5, ym1, q1032 + jmp m(iadst_8x8_internal_8bpc).main_pass2 +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m14}, m4, m2, m0, m1 + REPX {pminsd x, m15}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + shufpd m4, m2, 0xaa ; t4 t7 + shufpd m2, m5, 0xaa ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 
3784 + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m14}, m1, m2 + REPX {pminsd x, m15}, m1, m2 + shufpd m3, m1, m2, 0xaa + shufpd m1, m2, 0x55 + pmulld m3, m12 + pmulld m1, m12 + paddd m3, m13 + psubd m2, m3, m1 + paddd m3, m1 + psrad m2, 12 ; out4 -out5 + pshufd m3, m3, q1032 + psrad m3, 12 ; out2 -out3 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, identity +INV_TXFM_8X8_FN flipadst, flipadst + +cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x8_internal_10bpc).load + vpermi2q m1, m6, m2 ; 7 5 + vpermi2q m3, m4, m0 ; 3 1 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call m(iadst_8x8_internal_10bpc).main + punpckhdq m1, m3, m4 ; -out3 -out1 + punpckldq m3, m0 ; out2 out0 + punpckhdq m0, m2 ; -out7 -out5 + punpckldq m4, m2 ; out6 out4 + psubd m1, m11, m1 + paddd m3, m11 + psubd m0, m11, m0 + paddd m4, m11 + jmp m(iadst_8x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_8x8_internal_10bpc).main_pass2 + movu m10, [permC+1] + vbroadcasti32x8 m12, [pw_m2048_2048+16] + lea r6, [strideq*3] + vpermt2q m0, m10, m1 ; 7 6 5 4 + vpbroadcastd m11, [pixel_10bpc_max] + vpermt2q m2, m10, m3 ; 3 2 1 0 + pxor m10, m10 + pmulhrsw m8, m12, m2 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m0 + jmp m(idct_8x8_internal_10bpc).write_8x4 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + mova m1, [cq+64*0] + packssdw m1, [cq+64*2] ; 0 4 1 5 + mova m2, [cq+64*1] ; 2 6 3 7 + packssdw m2, [cq+64*3] + mova m0, [o(idtx8x8p)] + vpermb m1, m0, m1 + vpermb m2, m0, m2 + punpckldq m0, m1, m2 ; 0 1 4 5 + punpckhdq m1, m2 ; 2 3 6 7 + jmp tx2q +.pass2: + movu m3, [o(permC+2)] + vpbroadcastd m12, [o(pw_4096)] + psrlq m2, m3, 32 + vpermi2q m2, m0, m1 + vpermt2q m0, m3, m1 + jmp m(idct_8x8_internal_10bpc).end2 + +%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, 35 +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, adst + +cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call .load + call .main + call .main_end +.pass1_end: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + jmp tx2q +.pass2: + mova m8, [o(idct8x16p)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3 + punpckhdq m5, m0, m1 + punpckldq m0, m1 + punpckhdq m4, m2, m3 + punpckldq m2, m3 + punpcklqdq m8, m0, m2 ; 15 1 + punpckhqdq m0, m2 ; 7 9 + punpckhqdq m1, m5, m4 ; 3 13 + punpcklqdq m5, m4 ; 11 5 + lea r5, [o_base_8bpc] + vextracti32x8 ym7, m8, 1 ; 14 2 + vextracti32x8 ym3, m0, 1 ; 6 10 + vextracti32x8 ym6, m1, 1 ; 12 4 + vextracti32x8 ym9, m5, 1 ; 8 0 + call m(idct_8x16_internal_8bpc).main2 + mova m8, [permC] + vpbroadcastd m12, [pw_2048] + vpermt2q m0, m8, m1 + lea r6, [strideq*3] + vpermt2q m2, m8, m3 + vpbroadcastd m11, [pixel_10bpc_max] + vpermt2q m4, m8, m5 + pxor m10, m10 + vpermt2q m6, m8, m7 + pmulhrsw m8, m12, m0 + call 
m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m2 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m4 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m6 + jmp m(idct_8x8_internal_10bpc).write_8x4 +.fast: + mova ym0, [cq+64*0] + mova ym4, [cq+64*2] + mova ym1, [cq+64*1] + mova ym5, [cq+64*5] + mova ym2, [cq+64*4] + mova ym6, [cq+64*6] + mova ym3, [cq+64*7] + mova ym7, [cq+64*3] + call .round_input_fast + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_10bpc).main_end + movu m6, [o(permC+3)] + packssdw m3, m1, m3 + packssdw m1, m0, m2 + vprolq m3, 32 + vpermd m1, m6, m1 + vpermd m3, m6, m3 + mova ym0, ym1 ; 0 4 + vextracti32x8 ym1, m1, 1 ; 1 5 + mova ym2, ym3 ; 2 6 + vextracti32x8 ym3, m3, 1 ; 3 7 + jmp tx2q +ALIGN function_align +.round_input_fast: + movshdup m8, [o(permB)] + vpbroadcastd m12, [o(pd_2896)] + vpermt2q m0, m8, m4 + vpermt2q m1, m8, m5 + vpermt2q m2, m8, m6 + vpermt2q m3, m8, m7 + vpbroadcastd m13, [o(pd_2048)] + REPX {pmulld x, m12}, m0, m1, m2, m3 + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + REPX {paddd x, m13}, m0, m1, m2, m3 + vpbroadcastd m11, [o(pd_1)] + REPX {psrad x, 12 }, m0, m1, m2, m3 + ret +ALIGN function_align +.load: + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] +.load2: + vpbroadcastd m12, [o(pd_2896)] + pmulld m0, m12, [cq+64*0] + pmulld m1, m12, [cq+64*1] + pmulld m2, m12, [cq+64*2] + pmulld m3, m12, [cq+64*3] + vpbroadcastd m13, [o(pd_2048)] + pmulld m4, m12, [cq+64*4] + pmulld m5, m12, [cq+64*5] + pmulld m6, m12, [cq+64*6] + pmulld m7, m12, [cq+64*7] + REPX {paddd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.main: + ITX_MULSUB_2D 5, 3, 8, 9, 10, 13, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 13, 799, 4017 ; t4a t7a + pmulld m0, m12 + pmulld m4, m12 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + psubd m5, m7, m3 ; t6a + paddd m7, m3 ; t7 + pmaxsd m5, m14 + pmaxsd m1, m14 + pminsd m5, m15 + pminsd m1, m15 + pmulld m5, m12 + pmulld m1, m12 + ITX_MULSUB_2D 2, 6, 3, 9, 10, 13, 1567, 3784 ; t2 t3 + pmaxsd m8, m14 + pmaxsd m7, m14 + paddd m0, m13 + pminsd m8, m15 + psubd m3, m0, m4 + paddd m5, m13 + paddd m0, m4 + psubd m4, m5, m1 + paddd m5, m1 + REPX {psrad x, 12 }, m3, m5, m0, m4 + paddd m1, m3, m2 ; dct4 out1 + psubd m2, m3, m2 ; dct4 out2 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + pminsd m6, m15, m7 + REPX {pmaxsd x, m14}, m0, m1, m2, m3 + REPX {pminsd x, m15}, m0, m1, m2, m3 + ret +.main_end: + vpbroadcastd m11, [o(pd_1)] +.main_end2: + REPX {paddd x, m11}, m0, m1, m2, m3 + psubd m7, m0, m6 ; out7 + paddd m0, m6 ; out0 + psubd m6, m1, m5 ; out6 + paddd m1, m5 ; out1 + psubd m5, m2, m4 ; out5 + paddd m2, m4 ; out2 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, identity, 35 +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, adst + +cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call m(idct_8x16_internal_10bpc).load + call .main + psrad m0, 1 + psrad m1, 1 + psrad m6, m10, 1 + psrad m7, m11, 1 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + jmp m(idct_8x16_internal_10bpc).pass1_end +.fast: + call .fast_main + punpcklqdq m1, m2, m4 ; out4 out6 + punpckhqdq m2, m0 ; -out5 -out7 + punpcklqdq m0, m3 ; out0 out2 + punpckhqdq m4, m3 ; 
-out1 -out3 + paddd m1, m11 + psubd m3, m11, m2 + paddd m0, m11 + psubd m4, m11, m4 +.fast_end: + movu m5, [o(permC+3)] + REPX {psrad x, 1}, m1, m0, m3, m4 + packssdw m2, m0, m1 ; 0 2 4 6 + packssdw m3, m4, m3 ; 1 3 5 7 + vpermd m2, m5, m2 + vpermd m3, m5, m3 + mova ym0, ym2 + vextracti32x8 ym2, m2, 1 + mova ym1, ym3 + vextracti32x8 ym3, m3, 1 + jmp tx2q +.pass2: + call .pass2_main + movu m4, [permB+2] + vbroadcasti32x8 m12, [pw_2048_m2048+16] + psrlq m7, m4, 8 + vpermi2q m4, m0, m3 ; 0 1 2 3 + psrlq m5, m7, 24 + vpermi2q m7, m0, m3 ; 12 13 14 15 + psrlq m6, m5, 8 + vpermq m5, m5, m1 ; 4 5 6 7 + vpermq m6, m6, m2 ; 8 9 10 11 +.pass2_end: + vpbroadcastd m11, [pixel_10bpc_max] + pxor m10, m10 + lea r6, [strideq*3] + pmulhrsw m8, m12, m4 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m5 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m6 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m7 + jmp m(idct_8x8_internal_10bpc).write_8x4 +ALIGN function_align +.main: + ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7 + vpbroadcastd m10, [o(pd_1567)] + vpbroadcastd m11, [o(pd_3784)] + ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a + vpbroadcastd m12, [o(pd_1448)] + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m14}, m5, m3, m2, m9 + REPX {pminsd x, m15}, m5, m3, m2, m9 + REPX {pmulld x, m12}, m5, m3, m2, m9 + vpbroadcastd m4, [o(pd_1)] + psubd m8, m5, m3 ; (t2 - t3) * 1448 + paddd m3, m5 ; (t2 + t3) * 1448 + psubd m5, m2, m9 ; (t6 - t7) * 1448 + paddd m2, m9 ; (t6 + t7) * 1448 + vpbroadcastd m9, [o(pd_3072)] + paddd m0, m4 + psubd m1, m4, m1 + paddd m10, m6, m4 + psubd m11, m4, m7 + paddd m2, m9 + paddd m8, m9 + vpbroadcastd m9, [o(pd_3071)] + psubd m3, m9, m3 + psubd m9, m5 + ret +ALIGN function_align +.fast_main: + mova ym0, [cq+64*0] + mova ym4, [cq+64*2] + mova ym1, [cq+64*7] + mova ym5, [cq+64*5] + mova ym2, [cq+64*4] + mova ym6, [cq+64*6] + mova ym3, [cq+64*3] + mova ym7, [cq+64*1] + call m(idct_8x16_internal_10bpc).round_input_fast + jmp m(iadst_8x8_internal_10bpc).main +ALIGN function_align +.pass2_main: + mova m8, [o(iadst8x16p)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3 + vpbroadcastd m10, [o(pw_2896x8)] + punpckhdq m5, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m2, m3 + punpckldq m2, m3 + lea r5, [o_base_8bpc] + punpckhqdq m4, m0, m2 ; 12 3 14 1 + punpcklqdq m0, m2 ; 0 15 2 13 + punpckhqdq m6, m5, m1 ; 8 7 10 5 + punpcklqdq m5, m1 ; 4 11 6 9 + call m(iadst_8x16_internal_8bpc).main2 + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m10 ; -out7 out4 out6 -out5 + pmulhrsw m2, m10 ; out8 -out11 -out9 out10 + ret + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, identity, 35 +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst + +cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call 
m(idct_8x16_internal_10bpc).load + call m(iadst_8x16_internal_10bpc).main + psrad m7, m0, 1 + psrad m0, m11, 1 + psrad m6, m1, 1 + psrad m1, m10, 1 + psrad m5, m2, 12 + psrad m2, m9, 12 + psrad m4, m3, 12 + psrad m3, m8, 12 + jmp m(idct_8x16_internal_10bpc).pass1_end +.fast: + call m(iadst_8x16_internal_10bpc).fast_main + punpckhqdq m1, m3, m4 ; -out3 -out1 + punpcklqdq m3, m0 ; out2 out0 + punpckhqdq m0, m2 ; -out7 -out5 + punpcklqdq m4, m2 ; out6 out4 + psubd m1, m11, m1 + paddd m3, m11 + psubd m0, m11, m0 + paddd m4, m11 + jmp m(iadst_8x16_internal_10bpc).fast_end +.pass2: + call m(iadst_8x16_internal_10bpc).pass2_main + movu m7, [permB+2] + vbroadcasti32x8 m12, [pw_m2048_2048+16] + psrlq m4, m7, 8 + vpermi2q m7, m3, m0 ; 3 2 1 0 + psrlq m5, m4, 24 + vpermi2q m4, m3, m0 ; 15 14 13 12 + psrlq m6, m5, 8 + vpermq m5, m5, m2 ; 11 10 9 8 + vpermq m6, m6, m1 ; 7 6 5 4 + jmp m(iadst_8x16_internal_10bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x16_internal_10bpc).load2 + jmp m(idct_8x16_internal_10bpc).pass1_end +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + pmulhrsw m4, m8, m0 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m7, [o(pw_2048)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + vpbroadcastd m6, [o(pixel_10bpc_max)] + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m1 + punpckhdq m4, m1 + pxor m5, m5 + punpckhqdq m1, m0, m2 ; 1 5 9 13 + punpcklqdq m0, m2 ; 0 4 8 12 + punpcklqdq m2, m3, m4 ; 2 6 10 14 + punpckhqdq m3, m4 ; 3 7 11 15 + lea r6, [strideq*3] + pmulhrsw m0, m7 + call .write_8x4_start + pmulhrsw m0, m7, m1 + call .write_8x4 + pmulhrsw m0, m7, m2 + call .write_8x4 + pmulhrsw m0, m7, m3 +.write_8x4: + add dstq, strideq + add cq, 64*2 +.write_8x4_start: + mova xm4, [dstq+strideq*0] + vinserti32x4 ym4, [dstq+strideq*4], 1 + vinserti32x4 m4, [dstq+strideq*8], 2 + vinserti32x4 m4, [dstq+r6*4 ], 3 + mova [cq+64*0], m5 + mova [cq+64*1], m5 + paddw m4, m0 + pmaxsw m4, m5 + pminsw m4, m6 + mova [dstq+strideq*0], xm4 + vextracti32x4 [dstq+strideq*4], ym4, 1 + vextracti32x4 [dstq+strideq*8], m4, 2 + vextracti32x4 [dstq+r6*4 ], m4, 3 + ret + +%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 +.dconly: + vpbroadcastd m2, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m1, r6d + paddsw m1, m2 +.dconly_loop: + mova ym0, [dstq+strideq*0] + vinserti32x8 m0, [dstq+strideq*1], 1 + paddsw m0, m1 + psubusw m0, m2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity, -21 +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, adst + +cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + pmulld m4, m12, [cq+64*0] ; 0 1 + pmulld m9, m12, [cq+64*1] ; 2 3 + pmulld m8, m12, [cq+64*2] ; 4 5 + pmulld m7, m12, [cq+64*3] ; 6 7 + vpbroadcastd m13, [o(pd_2048)] + pxor m2, m2 + mova m15, 
[o(permB)] + REPX {mova [cq+64*x], m2}, 0, 1, 2, 3 + psrlq m0, m15, 32 + REPX {paddd x, m13}, m4, m9, m8, m7 + vpbroadcastd m14, [o(clip_18b_min)] + REPX {psrad x, 12 }, m4, m8, m9, m7 + mova m1, m0 + vpermi2q m0, m4, m8 ; 0 4 + cmp eobd, 43 + jl .fast + pmulld m5, m12, [cq+64*4] ; 8 9 + pmulld m10, m12, [cq+64*5] ; 10 11 + pmulld m11, m12, [cq+64*6] ; 12 13 + pmulld m6, m12, [cq+64*7] ; 14 15 + REPX {mova [cq+64*x], m2}, 4, 5, 6, 7 + REPX {paddd x, m13}, m5, m10, m11, m6 + REPX {psrad x, 12 }, m10, m5, m11, m6 + mova m2, m1 + vpermi2q m1, m9, m10 ; 2 10 + mova m3, m2 + vpermi2q m2, m5, m11 ; 8 12 + vpermi2q m3, m6, m7 ; 14 6 + vpermt2q m4, m15, m11 ; 1 13 + vpermt2q m6, m15, m9 ; 15 3 + vpermt2q m5, m15, m8 ; 9 5 + vpermt2q m7, m15, m10 ; 7 11 + vpbroadcastd m15, [o(clip_18b_max)] + call m(idct_8x8_internal_10bpc).main + call .main + jmp .pass1_end +.fast: + vpermi2q m1, m9, m7 ; 2 6 + vpermt2q m4, m15, m9 ; 1 3 + vpermt2q m7, m15, m8 ; 7 5 + vpbroadcastd m15, [o(clip_18b_max)] + call m(idct_8x8_internal_10bpc).main_fast + call .main_fast +.pass1_end: + call m(idct_8x16_internal_10bpc).main_end + mova m8, [o(permA)] + psrlq m9, m8, 8 +.pass1_end2: + mova m10, m9 + mova m11, m8 + call .transpose_16x8 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(idct_16x8_internal_8bpc).main + movshdup m4, [permC] + vpbroadcastd m13, [pw_2048] + psrlq m5, m4, 8 + vpermq m0, m4, m0 + vpermq m1, m5, m1 + vpermq m2, m4, m2 + vpermq m3, m5, m3 +.end: + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + pmulhrsw m8, m13, m0 + pmulhrsw m9, m13, m1 + lea r6, [strideq*3] + call .write_16x4 + pmulhrsw m8, m13, m2 + pmulhrsw m9, m13, m3 +.write_16x4: + mova ym10, [dstq+strideq*0] + vinserti32x8 m10, [dstq+strideq*1], 1 + paddw m8, m10 + mova ym10, [dstq+strideq*2] + vinserti32x8 m10, [dstq+r6 ], 1 + paddw m9, m10 + pmaxsw m8, m14 + pmaxsw m9, m14 + pminsw m8, m15 + pminsw m9, m15 + mova [dstq+strideq*0], ym8 + vextracti32x8 [dstq+strideq*1], m8, 1 + mova [dstq+strideq*2], ym9 + vextracti32x8 [dstq+r6 ], m9, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + pmulld m6, m4 ; t15 t12 + pmulld m4, m3 ; t9 t10 + REPX {paddd x, m13}, m6, m4 + REPX {psrad x, 12 }, m6, m4 + mova m5, m6 ; t14 t13 + mova m9, m4 ; t8 t11 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + vbroadcasti32x4 m5, [o(pd_m2598_1931)] + vbroadcasti32x4 m9, [o(pd_3166_3612)] + pmulld m6, m4 ; t15a t12a + pmulld m4, m3 ; t8a t11a + pmulld m5, m7 ; t9a t10a + pmulld m7, m9 ; t14a t13a + jmp .main2 +.main: + ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189 + ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612 +.main2: + REPX {paddd x, m13}, m4, m6, m5, m7 + REPX {psrad x, 12 }, m4, m5, m6, m7 + paddd m9, m4, m5 ; t8 t11 + psubd m4, m5 ; t9 t10 + psubd m5, m6, m7 ; t14 t13 + paddd m6, m7 ; t15 t12 + REPX {pmaxsd x, m14}, m5, m4, m9, m6 + REPX {pminsd x, m15}, m5, m4, m9, m6 +.main3: + psubd m3, m0, m1 ; dct8 out7 out6 + paddd m0, m1 ; dct8 out0 out1 + vbroadcasti32x4 m7, [o(pd_3784_m3784)] + pmulld m7, m5 + vpmulld m5, [o(pd_1567)] {1to16} + paddd m1, m2, m8 ; dct8 out3 out2 + psubd m2, m8 ; dct8 out4 out5 + vbroadcasti32x4 m8, [o(pd_1567_m1567)] + pmulld m8, m4 + vpmulld m4, [o(pd_3784)] {1to16} + REPX {pmaxsd x, m14}, m0, m1 + REPX {pminsd x, m15}, m0, m1 + paddd m7, m13 + paddd m5, m13 + paddd m7, m8 + psubd m5, m4 + psrad m7, 
12 ; t14a t10a + psrad m5, 12 ; t9a t13a + punpckhqdq m4, m9, m7 + punpcklqdq m8, m9, m5 + punpckhqdq m5, m6, m5 + punpcklqdq m6, m7 + psubd m7, m8, m4 ; t11a t10 + paddd m8, m4 ; t8a t9 + psubd m4, m6, m5 ; t12a t13 + paddd m6, m5 ; t15a t14 + REPX {pmaxsd x, m14}, m4, m7 + REPX {pminsd x, m15}, m4, m7 + pmulld m4, m12 + pmulld m7, m12 + REPX {pmaxsd x, m14}, m2, m3, m6, m8 + REPX {pminsd x, m15}, m2, m3, m6, m8 + paddd m4, m13 + paddd m5, m4, m7 + psubd m4, m7 + psrad m4, 12 ; t11 t10a + psrad m5, 12 ; t12 t13a + ret +ALIGN function_align +.transpose_16x8: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpermi2d m8, m0, m2 + vpermt2d m0, m9, m2 + vpermi2d m10, m1, m3 + vpermi2d m11, m1, m3 + punpckhwd m3, m8, m0 + punpcklwd m1, m8, m0 + punpckhwd m4, m10, m11 + punpcklwd m2, m10, m11 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, identity, -21 +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, adst + +cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + call .main_pass1 + vpbroadcastd m9, [o(pd_1)] + paddd m0, m9 + psubd m1, m9, m1 + paddd m2, m9 + psubd m3, m9, m3 + paddd m4, m9, m5 + psubd m5, m9, m6 + paddd m6, m9, m7 + psubd m7, m9, m8 +.pass1_end: + mova m9, [o(permA)] + psrlq m8, m9, 8 + REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp m(idct_16x8_internal_10bpc).pass1_end2 +.pass2: + call .main_pass2 + vpermq m8, m13, m0 + vpermq m9, m13, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m13, m2 + vpermq m9, m13, m3 + jmp m(idct_16x8_internal_10bpc).write_16x4 +ALIGN function_align +.main_pass1: + vpbroadcastd m12, [o(pd_2896)] + pmulld m2, m12, [cq+64*0] + pmulld m7, m12, [cq+64*1] + pmulld m1, m12, [cq+64*2] + pmulld m5, m12, [cq+64*3] + vpbroadcastd m13, [o(pd_2048)] + pxor m4, m4 + mova m10, [o(permB)] + REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 + REPX {paddd x, m13}, m2, m7, m1, m5 + psrlq m6, m10, 32 + REPX {psrad x, 12 }, m2, m7, m1, m5 + mova m0, m6 + vpermi2q m0, m2, m7 ; 0 2 + vpermt2q m7, m10, m2 ; 3 1 + mova m2, m6 + vpermi2q m2, m1, m5 ; 4 6 + vpermt2q m5, m10, m1 ; 7 5 + cmp eobd, 43 + jl .main_fast + pmulld m8, m12, [cq+64*4] + pmulld m3, m12, [cq+64*5] + pmulld m9, m12, [cq+64*6] + pmulld m1, m12, [cq+64*7] + REPX {mova [cq+64*x], m4}, 4, 5, 6, 7 + REPX {paddd x, m13}, m8, m3, m9, m1 + REPX {psrad x, 12 }, m8, m3, m9, m1 + mova m4, m6 + vpermi2q m4, m8, m3 ; 8 10 + vpermt2q m3, m10, m8 ; 11 9 + vpermi2q m6, m9, m1 ; 12 14 + vpermt2q m1, m10, m9 ; 15 13 +.main: + ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106 + ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601 + jmp .main2 +.main_fast: + vbroadcasti32x4 m1, [o(pd_4091_3973)] + vbroadcasti32x4 m8, [o(pd_201_995)] + vbroadcasti32x4 m3, [o(pd_3703_3290)] + vbroadcasti32x4 m9, [o(pd_1751_2440)] + vbroadcasti32x4 m4, [o(pd_2751_2106)] + vbroadcasti32x4 m10, [o(pd_3035_3513)] + vbroadcasti32x4 m6, [o(pd_1380_601)] + vbroadcasti32x4 m11, [o(pd_3857_4052)] + pmulld m1, m0 + pmulld m0, m8 + pmulld m3, m2 + pmulld m2, m9 + pmulld m4, m5 + pmulld m5, m10 + pmulld m6, m7 + pmulld m7, m11 +.main2: + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + REPX {psubd x, m13, x}, m1, m3 + REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7 + psubd m8, m0, m4 ; t8a t10a 
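The recurring REPX {pmaxsd x, m14} / {pminsd x, m15} pairs in these butterflies clamp every 32-bit intermediate to the range loaded at function entry: clip_18b_min/max (signed 18-bit, -0x20000..0x1ffff) for the 10bpc functions, and clip_20b_min/max for the 12bpc entry points that reuse them. A small sketch of that clamp, with hypothetical helper names:

#include <stdint.h>

static inline int32_t iclip(int32_t v, int32_t lo, int32_t hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* pmaxsd against clip_*_min followed by pminsd against clip_*_max */
static inline int32_t clip_intermediate(int32_t v, int bitdepth)
{
    const int32_t mx = (1 << (bitdepth + 7)) - 1;  /* 0x1ffff for 10bpc */
    return iclip(v, -mx - 1, mx);
}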
+ paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m14}, m8, m4, m5, m6 + REPX {pminsd x, m15}, m8, m4, m5, m6 + vbroadcasti32x4 m11, [o(pd_4017_2276)] + vbroadcasti32x4 m10, [o(pd_799_3406)] + ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11 + ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10 + REPX {pmaxsd x, m14}, m0, m2, m1, m3 + REPX {pminsd x, m15}, m0, m2, m1, m3 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m14}, m7, m3, m2, m6 + REPX {pminsd x, m15}, m7, m3, m2, m6 + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + vpbroadcastd m11, [o(pd_1567)] + vpbroadcastd m10, [o(pd_3784)] + ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10 + REPX {pmaxsd x, m14}, m0, m4, m1, m8 + REPX {pminsd x, m15}, m0, m4, m1, m8 + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m14}, m6, m5, m3, m4 + mov r6d, 0x3333 + REPX {pminsd x, m15}, m6, m5, m3, m4 + kmovw k1, r6d + REPX {pmulld x, m12}, m6, m5, m3, m4 + pxor m9, m9 + REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8 + paddd m6, m13 + paddd m4, m13 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6 + ret +ALIGN function_align +.main_pass2: + lea r5, [o_base_8bpc] + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m13, [permC] + pmulhrsw m0, m6 + pmulhrsw m1, m6 + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + lea r6, [strideq*3] + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, identity, -21 +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst + +cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_10bpc).main_pass1 + vpbroadcastd m9, [o(pd_1)] + psubd m4, m9, m3 + paddd m3, m9, m5 + paddd m5, m9, m2 + psubd m2, m9, m6 + psubd m6, m9, m1 + paddd m1, m9, m7 + paddd m7, m9, m0 + psubd m0, m9, m8 + jmp m(iadst_16x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_16x8_internal_10bpc).main_pass2 + psrlq m13, 8 + vpermq m8, m13, m3 + vpermq m9, m13, m2 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m13, m1 + vpermq m9, m13, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x16_internal_10bpc).load2 + vpbroadcastd m8, [o(pd_5793)] + vpbroadcastd m9, [o(pd_3072)] + pxor m10, m10 + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m10}, 0, 1, 2, 3 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m10}, 4, 5, 6, 7 + REPX {psrad x, 
12}, m0, m1, m2, m3, m4, m5, m6, m7 + psrlq m8, [o(permA)], 16 + psrlq m9, m8, 8 + mova m10, m8 + mova m11, m9 + call m(idct_16x8_internal_10bpc).transpose_16x8 + jmp tx2q +.pass2: + movshdup m4, [o(permC)] + vpbroadcastd m13, [o(pw_4096)] + REPX {vpermq x, m4, x}, m0, m1, m2, m3 + jmp m(idct_16x8_internal_10bpc).end + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity, 28 +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, adst + +cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 36 + jl .fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + call m(idct_8x16_internal_10bpc).main + mova m16, [cq+64* 1] + mova m17, [cq+64* 3] + mova m18, [cq+64* 5] + mova m19, [cq+64* 7] + mova m20, [cq+64* 9] + mova m21, [cq+64*11] + mova m22, [cq+64*13] + mova m23, [cq+64*15] + call .main + call .main_end +.pass1_end: +%if WIN64 + movaps xmm6, [cq+16*0] + movaps xmm7, [cq+16*1] +%endif + vzeroupper +.pass1_end2: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + punpckhdq m7, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m3, m6 + punpckldq m3, m6 + vshufi32x4 m6, m0, m4, q3232 + vinserti32x8 m0, ym4, 1 + vinserti32x8 m4, m8, ym3, 1 + vshufi32x4 m8, m3, q3232 + vinserti32x8 m3, m7, ym1, 1 + vshufi32x4 m7, m1, q3232 + vshufi32x4 m1, m2, m5, q3232 + vinserti32x8 m2, ym5, 1 + vshufi32x4 m5, m7, m1, q2020 ; 10 11 + vshufi32x4 m7, m1, q3131 ; 14 15 + vshufi32x4 m1, m3, m2, q2020 ; 2 3 + vshufi32x4 m3, m2, q3131 ; 6 7 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 +.pass1_end3: + mov r6d, 64*12 + pxor m8, m8 +.zero_loop: + mova [cq+r6+64*3], m8 + mova [cq+r6+64*2], m8 + mova [cq+r6+64*1], m8 + mova [cq+r6+64*0], m8 + sub r6d, 64*4 + jge .zero_loop + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(idct_16x16_internal_8bpc).main + movshdup m10, [permC] + vpbroadcastd m13, [pw_2048] + psrlq m11, m10, 8 + vpermq m8, m10, m0 + vpermq m0, m11, m7 + vpermq m7, m11, m1 + vpermq m1, m10, m6 + vpermq m6, m10, m2 + vpermq m2, m11, m5 + vpermq m5, m11, m3 + vpermq m3, m10, m4 +.pass2_end: + lea r6, [strideq*3] + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + pmulhrsw m8, m13, m8 + pmulhrsw m9, m13, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m8, m13, m6 + pmulhrsw m9, m13, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m8, m13, m3 + pmulhrsw m9, m13, m2 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m8, m13, m1 + pmulhrsw m9, m13, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4 +.fast: + mova ym0, [cq+64*0] + mova ym2, [cq+64*4] + movshdup m8, [o(permB)] + mova ym1, [cq+64*2] + mova ym3, [cq+64*6] + mova ym4, [cq+64*1] + mova ym5, 
[cq+64*3] + mova ym6, [cq+64*5] + mova ym7, [cq+64*7] + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_2)] + call m(idct_8x16_internal_10bpc).main_end2 + mova m8, [o(permA)] + psrlq m9, m8, 8 + jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2 +ALIGN function_align +.main: + ITX_MULSUB_2D 16, 23, 7, 9, 10, 13, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 20, 19, 7, 9, 10, 13, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2D 22, 17, 7, 9, 10, 13, 3920, 1189 ; t11a, t12a + ITX_MULSUB_2D 18, 21, 7, 9, 10, 13, 1931, 3612 ; t10a, t13a + paddd m9, m20, m16 ; t8 + psubd m20, m16, m20 ; t9 + psubd m16, m22, m18 ; t10 + paddd m18, m22 ; t11 + paddd m22, m23, m19 ; t15 + psubd m23, m19 ; t14 + psubd m19, m17, m21 ; t13 + paddd m17, m21 ; t12 + vpbroadcastd m11, [o(pd_3784)] + REPX {pmaxsd x, m14}, m20, m23, m16, m19 + vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m20, m23, m16, m19 + ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11 + ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m9, m18, m22, m17 + REPX {pminsd x, m15}, m9, m18, m22, m17 + paddd m21, m20, m19 ; t14 + psubd m20, m19 ; t13 + psubd m19, m9, m18 ; t11a + paddd m9, m18 ; t8a + psubd m18, m23, m16 ; t10 + paddd m16, m23 ; t9 + psubd m23, m22, m17 ; t12a + paddd m22, m17 ; t15a + REPX {pmaxsd x, m14}, m20, m23, m18, m19 + REPX {pminsd x, m15}, m20, m23, m18, m19 + REPX {pmulld x, m12}, m20, m23, m18, m19 + psubd m7, m0, m6 ; dct8 out7 + paddd m0, m6 ; dct8 out0 + psubd m6, m1, m5 ; dct8 out6 + paddd m1, m5 ; dct8 out1 + REPX {pmaxsd x, m14}, m7, m0, m6, m1 + psubd m5, m2, m4 ; dct8 out5 + paddd m2, m4 ; dct8 out2 + REPX {pminsd x, m15}, m7, m0, m6, m1 + psubd m4, m3, m8 ; dct8 out4 + paddd m3, m8 ; dct8 out3 + REPX {pmaxsd x, m14}, m5, m2, m4, m3 + paddd m20, m13 + paddd m23, m13 + REPX {pminsd x, m15}, m5, m2, m4, m3 + psubd m17, m20, m18 ; t10a + paddd m20, m18 ; t13a + REPX {pmaxsd x, m14}, m22, m21, m16, m9 + psubd m18, m23, m19 ; t11 + paddd m19, m23 ; t12 + REPX {pminsd x, m15}, m22, m21, m16, m9 + REPX {psrad x, 12 }, m20, m19, m18, m17 + ret +.main_end: + vpbroadcastd m11, [o(pd_2)] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m23, m0, m22 ; out15 + paddd m0, m22 ; out0 + psubd m22, m1, m21 ; out14 + paddd m1, m21 ; out1 + psubd m21, m2, m20 ; out13 + paddd m2, m20 ; out2 + psubd m20, m3, m19 ; out12 + paddd m3, m19 ; out3 + psubd m19, m4, m18 ; out11 + paddd m4, m18 ; out4 + psubd m18, m5, m17 ; out10 + paddd m5, m17 ; out5 + psubd m17, m6, m16 ; out9 + paddd m6, m16 ; out6 + psubd m16, m7, m9 ; out8 + paddd m7, m9 ; out7 + REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \ + m4, m20, m5, m21, m6, m22, m7, m23 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m20 + packssdw m5, m21 + packssdw m6, m22 + packssdw m7, m23 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, flipadst +INV_TXFM_16X16_FN adst, adst + +cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 36 + jl .fast + call .main_pass1 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m5, m20 + packssdw m5, m6, m21 + packssdw m6, m7, m22 + packssdw m7, m8, m23 + jmp m(idct_16x16_internal_10bpc).pass1_end +.fast: + call .main_pass1_fast + vpbroadcastd m9, [o(pd_2)] + paddd m0, m9 + psubd m1, m9, m1 + paddd m2, m9 + psubd m3, m9, 
m3 + paddd m4, m9, m5 + psubd m5, m9, m6 + paddd m6, m9, m7 + psubd m7, m9, m8 +.pass1_fast_end: + mova m9, [o(permA)] + psrlq m8, m9, 8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 +.pass1_fast_end2: + mova m10, m9 + mova m11, m8 + call m(idct_16x8_internal_10bpc).transpose_16x8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(iadst_16x16_internal_8bpc).main_pass2b + movshdup m10, [permC] + mova m13, [pw_2048_m2048] + psrlq m11, m10, 8 + vpermq m8, m11, m0 + vpermq m0, m10, m7 + vpermq m7, m11, m1 + vpermq m1, m10, m6 + vpermq m6, m11, m2 + vpermq m2, m10, m5 + vpermq m5, m11, m3 + vpermq m3, m10, m4 + jmp m(idct_16x16_internal_10bpc).pass2_end +ALIGN function_align +.main_pass1: + mova m0, [cq+64* 0] +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + mova m23, [cq+64*15] + vpbroadcastd m13, [o(pd_2048)] + ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0 + mova m7, [cq+64* 7] + mova m16, [cq+64* 8] + ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8 + mova m2, [cq+64* 2] + mova m21, [cq+64*13] + ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2 + mova m5, [cq+64* 5] + mova m18, [cq+64*10] + ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10 + mova m4, [cq+64* 4] + mova m19, [cq+64*11] + ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4 + mova m3, [cq+64* 3] + mova m20, [cq+64*12] + ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12 + mova m6, [cq+64* 6] + mova m17, [cq+64* 9] + ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6 + mova m1, [cq+64* 1] + mova m22, [cq+64*14] + ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14 + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psubd m9, m23, m7 ; t9a + paddd m23, m7 ; t1a + psubd m7, m2, m18 ; t10a + paddd m18, m2 ; t2a + REPX {pmaxsd x, m14}, m9, m23, m7, m18 + psubd m2, m17, m1 ; t15a + paddd m17, m1 ; t7a + REPX {pminsd x, m15}, m9, m23, m7, m18 + psubd m1, m21, m5 ; t11a + paddd m21, m5 ; t3a + REPX {pmaxsd x, m14}, m2, m17, m1, m21 + psubd m5, m4, m20 ; t12a + paddd m4, m20 ; t4a + REPX {pminsd x, m15}, m2, m17, m1, m21 + psubd m20, m19, m3 ; t13a + paddd m19, m3 ; t5a + REPX {pmaxsd x, m14}, m5, m4, m20, m19 + psubd m8, m6, m22 ; t14a + paddd m6, m22 ; t6a + REPX {pminsd x, m15}, m5, m4, m20, m19 + psubd m22, m0, m16 ; t8a + paddd m16, m0 ; t0a + REPX {pmaxsd x, m14}, m8, m6, m22, m16 + vpbroadcastd m11, [o(pd_4017)] + vpbroadcastd m10, [o(pd_799)] + REPX {pminsd x, m15}, m8, m6, m22, m16 + ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8 + ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13 + vpbroadcastd m11, [o(pd_2276)] + vpbroadcastd m10, [o(pd_3406)] + ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10 + ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15 + paddd m0, m16, m4 ; t0 + psubd m16, m4 ; t4 + psubd m3, m23, m19 ; t5 + paddd m23, m19 ; t1 + REPX {pmaxsd x, m14}, m0, m16, m3, m23 + psubd m19, m18, m6 ; t6 + paddd m18, m6 ; t2 + REPX {pminsd x, m15}, m0, m16, m3, m23 + psubd m6, m21, m17 ; t7 + paddd m21, m17 ; t3 + REPX {pmaxsd x, m14}, m19, m18, m6, m21 + paddd m17, m9, m20 ; t8a + psubd m9, m20 ; t12a + REPX {pminsd x, m15}, m19, m18, m6, m21 + psubd m20, m22, m5 ; t13a + paddd m22, m5 ; t9a + REPX {pmaxsd x, m14}, m17, m9, m20, m22 + psubd m5, m1, m2 ; t14a + paddd m1, m2 ; t10a + REPX {pminsd x, m15}, m17, m9, m20, m22 + psubd m2, m7, m8 ; t15a + paddd m7, m8 ; t11a + REPX {pmaxsd x, m14}, m5, m1, m2, m7 + vpbroadcastd m11, [o(pd_3784)] 
+ vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m5, m1, m2, m7 + ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a + ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a + ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12 + ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15 + psubd m8, m0, m18 ; t2a + paddd m0, m18 ; out0 + psubd m18, m23, m21 ; t3a + paddd m23, m21 ; -out15 + paddd m21, m9, m5 ; -out13 + psubd m9, m5 ; t15a + psubd m5, m3, m6 ; t6 + paddd m3, m6 ; -out3 + REPX {pmaxsd x, m14}, m8, m18, m9, m5 + psubd m6, m20, m2 ; t14a + paddd m2, m20 ; out2 + paddd m20, m16, m19 ; out12 + psubd m16, m19 ; t7 + REPX {pminsd x, m15}, m8, m18, m9, m5 + psubd m19, m22, m7 ; t11 + paddd m22, m7 ; out14 + psubd m7, m17, m1 ; t10 + paddd m1, m17 ; -out1 + REPX {pmaxsd x, m14}, m6, m16, m19, m7 + vpbroadcastd m12, [o(pd_1448)] + vpbroadcastd m4, [o(pd_2)] + vpbroadcastd m10, [o(pd_5120)] + vpbroadcastd m11, [o(pd_5119)] + REPX {pminsd x, m15}, m6, m16, m19, m7 + psubd m17, m7, m19 ; -out9 + paddd m7, m19 ; out6 + psubd m19, m5, m16 ; -out11 + paddd m5, m16 ; out4 + REPX {pmulld x, m12}, m17, m7, m19, m5 + psubd m16, m8, m18 ; out8 + paddd m8, m18 ; -out7 + psubd m18, m6, m9 ; out10 + paddd m6, m9 ; -out5 + REPX {pmulld x, m12}, m16, m8, m18, m6 + REPX {paddd x, m4 }, m0, m2, m20, m22 + REPX {psubd x, m4, x}, m1, m3, m21, m23 + REPX {paddd x, m10 }, m7, m5, m16, m18 + REPX {psubd x, m11, x}, m17, m19, m8, m6 + REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3 + REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8 + ret +ALIGN function_align +.main_pass1_fast: + mova ym0, [cq+64*0] + mova ym1, [cq+64*2] + movshdup m8, [o(permB)] + mova ym6, [cq+64*1] + mova ym7, [cq+64*3] + mova ym2, [cq+64*4] + mova ym3, [cq+64*6] + mova ym4, [cq+64*5] + mova ym5, [cq+64*7] + vpermt2q m0, m8, m1 ; 0 2 + vpermt2q m7, m8, m6 ; 3 1 + vpermt2q m2, m8, m3 ; 4 6 + vpermt2q m5, m8, m4 ; 7 5 + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m12, [o(pd_2896)] + jmp m(iadst_16x8_internal_10bpc).main_fast + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 36 + jl .fast + call m(iadst_16x16_internal_10bpc).main_pass1 + packssdw m4, m19, m3 + packssdw m3, m20, m5 + packssdw m5, m18, m2 + packssdw m2, m21, m6 + packssdw m6, m17, m1 + packssdw m1, m22, m7 + packssdw m7, m16, m0 + packssdw m0, m23, m8 + jmp m(idct_16x16_internal_10bpc).pass1_end +.fast: + call m(iadst_16x16_internal_10bpc).main_pass1_fast + vpbroadcastd m9, [o(pd_2)] + psubd m4, m9, m3 + paddd m3, m9, m5 + paddd m5, m9, m2 + psubd m2, m9, m6 + psubd m6, m9, m1 + paddd m1, m9, m7 + paddd m7, m9, m0 + psubd m0, m9, m8 + jmp m(iadst_16x16_internal_10bpc).pass1_fast_end +.pass2: + lea r5, [o_base_8bpc] + call m(iadst_16x16_internal_8bpc).main_pass2b + movshdup m10, [permC] + movu m13, [pw_m2048_2048] + psrlq m11, m10, 8 + vpermq m8, m11, m7 + vpermq m7, m11, m6 + vpermq m6, m11, m5 + vpermq m5, m11, m4 + vpermq m3, m10, m3 + vpermq m2, m10, m2 + vpermq m1, m10, m1 + vpermq m0, m10, m0 + jmp m(idct_16x16_internal_10bpc).pass2_end + +INV_TXFM_16X16_FN identity, dct, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m10, [o(pd_5793)] + vpbroadcastd m11, [o(pd_5120)] + mov r6, cq + cmp eobd, 36 + jl .fast + call .pass1_main + packssdw m0, m6, m8 + packssdw m1, m7, m9 + call .pass1_main + packssdw m2, m6, m8 
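The identity-16 path here applies the nominal 2*sqrt(2) scale in fixed point: pass 1 (the .pass1_main helper) computes (x*5793 + 5120) >> 13 in 32-bit arithmetic, where 5793 is roughly sqrt(2)*4096 and the 16x16 pass-1 downshift is folded in, and pass 2 computes 2*x plus a pmulhrsw by pw_1697x16 in 16-bit arithmetic. A rough C sketch with hypothetical names; the real pass-2 code saturates with paddsw and still applies the final pw_2048 output rounding:

#include <stdint.h>

static int32_t idtx16_pass1(int32_t x)  /* pmulld pd_5793, paddd pd_5120, psrad 13 */
{
    return (int32_t)(((int64_t)x * 5793 + 5120) >> 13);    /* ~ x*sqrt(2)/2 */
}

static int16_t idtx16_pass2(int16_t x)  /* pmulhrsw pw_1697x16, then 2*x added */
{
    int32_t t = ((int32_t)x * (1697 * 16) + 0x4000) >> 15;  /* pmulhrsw */
    return (int16_t)(2 * x + t);                             /* ~ x*2*sqrt(2) */
}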
+ packssdw m3, m7, m9 + call .pass1_main + packssdw m4, m6, m8 + packssdw m5, m7, m9 + call .pass1_main + packssdw m6, m8 + packssdw m7, m9 + jmp m(idct_16x16_internal_10bpc).pass1_end2 +.fast: + call .pass1_main_fast + packssdw m0, m6, m7 + call .pass1_main_fast + packssdw m1, m6, m7 + call .pass1_main_fast + packssdw m2, m6, m7 + call .pass1_main_fast + packssdw m3, m6, m7 + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckldq m3, m4, m1 + punpckhdq m4, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + pxor m7, m7 + vshufi32x4 m2, m0, m3, q3131 + vshufi32x4 m0, m3, q2020 + vshufi32x4 m3, m1, m4, q3131 + vshufi32x4 m1, m4, q2020 + REPX {mova x, m7}, m4, m5, m6 + jmp m(idct_16x16_internal_10bpc).pass1_end3 +.pass2: + movshdup m11, [o(permC)] + vpbroadcastd m12, [o(pw_1697x16)] + lea r6, [strideq*3] + vpbroadcastd m13, [o(pw_2048)] + pxor m14, m14 + vpbroadcastd m15, [pixel_10bpc_max] + vpermq m8, m11, m0 + vpermq m9, m11, m1 + call .pass2_main + vpermq m8, m11, m2 + vpermq m9, m11, m3 + call .pass2_main + vpermq m8, m11, m4 + vpermq m9, m11, m5 + call .pass2_main + vpermq m8, m11, m6 + vpermq m9, m11, m7 +.pass2_main: + pmulhrsw m0, m12, m8 + pmulhrsw m1, m12, m9 + paddsw m8, m8 + paddsw m9, m9 + paddsw m8, m0 + paddsw m9, m1 + pmulhrsw m8, m13 + pmulhrsw m9, m13 + jmp m(idct_16x8_internal_10bpc).write_16x4 +ALIGN function_align +.pass1_main: + pmulld m6, m10, [r6+64*0] + pmulld m7, m10, [r6+64*1] + pmulld m8, m10, [r6+64*8] + pmulld m9, m10, [r6+64*9] + add r6, 64*2 + REPX {paddd x, m11}, m6, m7, m8, m9 + REPX {psrad x, 13 }, m6, m8, m7, m9 + ret +ALIGN function_align +.pass1_main_fast: + mova ym6, [r6+64* 0] + vinserti32x8 m6, [r6+64* 4], 1 + mova ym7, [r6+64* 8] + vinserti32x8 m7, [r6+64*12], 1 + add r6, 64 + REPX {pmulld x, m10}, m6, m7 + REPX {paddd x, m11}, m6, m7 + REPX {psrad x, 13 }, m6, m7 + ret + +cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + vpbroadcastd m11, [o(pd_2)] + mova m20, [o(idct8x32p)] + pxor m21, m21 + cmp eobd, 43 + jl .fast + call .pass1_main + punpcklwd m16, m0, m1 + punpcklwd m17, m2, m3 + punpckhwd m18, m0, m1 + punpckhwd m19, m2, m3 + cmp eobd, 107 + jge .full + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + call m(idct_8x16_internal_8bpc).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .end +.full: + add cq, 64 + call .pass1_main + punpcklwd m5, m0, m1 + punpcklwd m6, m2, m3 + punpckhwd m7, m0, m1 + punpckhwd m8, m2, m3 + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + punpckldq m4, m5, m6 ; 16 18 + punpckhdq m5, m6 ; 20 22 + punpckldq m6, m7, m8 ; 24 26 + punpckhdq m7, m8 ; 28 30 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + vextracti32x8 ym18, m4, 1 + vextracti32x8 ym19, m5, 1 + vextracti32x8 ym20, m6, 1 + vextracti32x8 ym21, m7, 1 + call m(idct_8x16_internal_8bpc).main + REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main + jmp .end +.fast: + movshdup 
m8, [o(permB)] + mova ym1, [cq+128*1] + mova ym5, [cq+128*5] + mova ym7, [cq+128*3] + mova ym3, [cq+128*7] + mova ym0, [cq+128*0] + mova ym4, [cq+128*2] + mova ym2, [cq+128*4] + mova ym6, [cq+128*6] + vpermt2q m1, m8, m5 ; 1 5 + vpermt2q m3, m8, m7 ; 7 3 + vpermt2q m0, m8, m4 ; 0 2 + vpermt2q m2, m8, m6 ; 4 6 + mova [cq+128*0], ym21 + REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_10bpc).main_end + packssdw m0, m2 + packssdw m1, m3 + vpermb m0, m20, m0 + vprold m20, 16 + vpermb m2, m20, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + call m(idct_8x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2 +.end: + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper + lea r3, [strideq*2] + vpbroadcastd m12, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m11, m11 + lea r3, [dstq+r3*8] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + call .write_8x4x2 + pmulhrsw m0, m10, m2 + pmulhrsw m1, m10, m3 + call .write_8x4x2 + pmulhrsw m0, m10, m4 + pmulhrsw m1, m10, m5 + call .write_8x4x2 + pmulhrsw m0, m10, m6 + pmulhrsw m1, m10, m7 +.write_8x4x2: + mova xm8, [dstq+strideq*0] + vinserti32x4 ym8, [dstq+strideq*1], 1 + vinserti32x4 m8, [dstq+strideq*2], 2 + vinserti32x4 m8, [dstq+r6 ], 3 + mova xm9, [r3 +r6 ] + vinserti32x4 ym9, [r3 +strideq*2], 1 + vinserti32x4 m9, [r3 +strideq*1], 2 + vinserti32x4 m9, [r3 +strideq*0], 3 + paddw m8, m0 + paddw m9, m1 + pmaxsw m8, m11 + pmaxsw m9, m11 + pminsw m8, m12 + pminsw m9, m12 + mova [dstq+strideq*0], xm8 + vextracti32x4 [dstq+strideq*1], ym8, 1 + vextracti32x4 [dstq+strideq*2], m8, 2 + vextracti32x4 [dstq+r6 ], m8, 3 + lea dstq, [dstq+strideq*4] + vextracti32x4 [r3 +strideq*0], m9, 3 + vextracti32x4 [r3 +strideq*1], m9, 2 + vextracti32x4 [r3 +strideq*2], ym9, 1 + mova [r3 +r6 ], xm9 + lea r3, [r3+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 +ALIGN function_align +.pass1_main: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x16_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_end2 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + REPX {vpermb x, m20, x}, m0, m1, m2, m3 + ret + +cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob + vpbroadcastd m9, [pw_5] + lea r4, [strideq*3] + pxor m10, m10 + lea r5, [strideq*5] + vpbroadcastd m11, [pixel_10bpc_max] + sub eobd, 107 + lea r6, [strideq+r4*2] +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + lea r7, [dstq+strideq*8] + REPX {mova [cq+128*x], m10}, 0, 1, 2, 3 + REPX {paddsw x, m9}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m10}, 4, 5, 6, 7 + REPX {psraw x, 3 }, m0, m1, m2, m3 + add cq, 64 + mova xm4, [dstq+strideq*0] + mova xm5, [dstq+strideq*1] + mova xm6, [dstq+strideq*2] + mova xm7, [dstq+r4 *1] + punpckhwd m8, m0, m1 + vinserti32x4 ym4, [dstq+strideq*4], 1 + punpcklwd m0, m1 + vinserti32x4 ym5, [dstq+r5 *1], 1 + punpckhwd m1, m2, m3 + vinserti32x4 ym6, [dstq+r4 *2], 1 + punpcklwd m2, m3 + vinserti32x4 ym7, [dstq+r6 
*1], 1 + punpckhwd m3, m0, m8 + vinserti32x4 m4, [r7 +strideq*0], 2 + punpcklwd m0, m8 + vinserti32x4 m5, [r7 +strideq*1], 2 + punpckhwd m8, m2, m1 + vinserti32x4 m6, [r7 +strideq*2], 2 + punpcklwd m2, m1 + vinserti32x4 m7, [r7 +r4 *1], 2 + punpckhqdq m1, m0, m2 + vinserti32x4 m4, [r7 +strideq*4], 3 + punpcklqdq m0, m2 + vinserti32x4 m5, [r7 +r5 *1], 3 + punpcklqdq m2, m3, m8 + vinserti32x4 m6, [r7 +r4 *2], 3 + punpckhqdq m3, m8 + vinserti32x4 m7, [r7 +r6 *1], 3 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + REPX {pmaxsw x, m10}, m0, m1, m2, m3 + REPX {pminsw x, m11}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+r4 *1], xm3 + vextracti32x4 [dstq+strideq*4], ym0, 1 + vextracti32x4 [dstq+r5 *1], ym1, 1 + vextracti32x4 [dstq+r4 *2], ym2, 1 + vextracti32x4 [dstq+r6 *1], ym3, 1 + lea dstq, [r7+strideq*8] + vextracti32x4 [r7 +strideq*0], m0, 2 + vextracti32x4 [r7 +strideq*1], m1, 2 + vextracti32x4 [r7 +strideq*2], m2, 2 + vextracti32x4 [r7 +r4 *1], m3, 2 + vextracti32x4 [r7 +strideq*4], m0, 3 + vextracti32x4 [r7 +r5 *1], m1, 3 + vextracti32x4 [r7 +r4 *2], m2, 3 + vextracti32x4 [r7 +r6 *1], m3, 3 + add eobd, 0x80000000 + jnc .loop + RET + +cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + mova m11, [o(permB)] + mova m0, [cq+64* 0] ; 0 1 + mova m4, [cq+64* 1] ; 2 3 + mova m1, [cq+64* 2] ; 4 5 + mova m8, [cq+64* 3] ; 6 7 + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psrlq m10, m11, 32 +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + mova m16, m11 + vpermi2q m16, m0, m1 ; 1 5 + mova m17, m11 + vpermi2q m17, m8, m4 ; 7 3 + cmp eobd, 43 + jl .fast + mova m18, [cq+64* 4] ; 8 9 + mova m20, [cq+64* 5] ; 10 11 + mova m6, [cq+64* 6] ; 12 13 + mova m7, [cq+64* 7] ; 14 15 + vpermt2q m0, m10, m18 ; 0 8 + vpermt2q m18, m11, m6 ; 9 13 + mova m19, m11 + vpermi2q m19, m7, m20 ; 15 11 + cmp eobd, 107 + jge .full + vpermt2q m1, m10, m6 ; 4 12 + vpermt2q m4, m10, m8 ; 2 6 + vpermt2q m7, m10, m20 ; 14 10 + mov r6d, 64*1 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + call .main_fast + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.full: + mova m2, [cq+64* 8] ; 16 17 + mova m5, [cq+64* 9] ; 18 19 + mova m9, [cq+64*10] ; 20 21 + mova m21, [cq+64*11] ; 22 23 + vpermt2q m1, m10, m9 ; 4 20 + vpermt2q m7, m10, m21 ; 14 22 + vpermt2q m21, m11, m5 ; 23 19 + vpermt2q m5, m10, m20 ; 18 10 + mova m20, m11 + vpermi2q m20, m2, m9 ; 17 21 + mova m22, [cq+64*12] ; 24 25 + mova m9, [cq+64*13] ; 26 27 + mova m3, [cq+64*14] ; 28 29 + mova m23, [cq+64*15] ; 30 31 + vpermt2q m2, m10, m22 ; 16 24 + vpermt2q m22, m11, m3 ; 25 29 + vpermt2q m3, m10, m6 ; 28 12 + vpermt2q m4, m10, m9 ; 2 26 + mova m6, m10 + vpermi2q m6, m23, m8 ; 30 6 + vpermt2q m23, m11, m9 ; 31 27 + mov r6d, 64*3 + call m(idct_8x8_internal_10bpc).main + call m(idct_16x8_internal_10bpc).main + call .main + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.fast: + vpermq m0, m10, m0 ; 0 0 + vpermq m1, m10, m1 ; 4 4 + vpermt2q m4, m10, m8 ; 2 6 + xor r6d, r6d + call m(idct_8x8_internal_10bpc).main_fast2 + call m(idct_16x8_internal_10bpc).main_fast2 + call .main_fast2 + call m(idct_16x16_internal_10bpc).main_end +.end: + mova m10, [o(idct32x8p)] +%if WIN64 + movaps xmm6, [cq+16*0] + movaps xmm7, [cq+16*1] +%endif + vzeroupper + psrlw 
m8, m10, 8 + mova m9, m8 + vpermi2w m8, m1, m5 + vpermt2w m1, m10, m5 + vprold m5, m9, 16 + vpermi2w m9, m3, m7 + vpermt2w m3, m10, m7 + vprold m10, 16 + mova m7, m5 + vpermi2w m5, m0, m4 + vpermt2w m0, m10, m4 + pxor m14, m14 + vpermi2w m7, m2, m6 + vpermt2w m2, m10, m6 +.zero_loop: + mova [cq+r6*4+64*3], m14 + mova [cq+r6*4+64*2], m14 + mova [cq+r6*4+64*1], m14 + mova [cq+r6*4+64*0], m14 + sub r6d, 64 + jge .zero_loop + punpckhdq m6, m5, m8 + punpckldq m5, m8 + punpckhdq m8, m7, m9 + punpckldq m7, m9 + punpckhdq m4, m2, m3 + punpckldq m2, m3 + punpckhdq m3, m0, m1 + punpckldq m0, m1 + vpbroadcastd m13, [o(pw_2048)] + vpbroadcastd m15, [o(pixel_10bpc_max)] + lea r5, [o_base_8bpc] + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + lea r6, [strideq*3] + pmulhrsw m0, m13 + pmulhrsw m1, m13 + pmulhrsw m2, m13 + pmulhrsw m3, m13 + call .write_32x4 + pmulhrsw m0, m13, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, m13, m6 + pmulhrsw m3, m13, m7 +.write_32x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r6 ] + REPX {pmaxsw x, m14}, m0, m1, m2, m3 + REPX {pminsw x, m15}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r6 ], m3 + lea dstq, [dstq+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [o(dconly_10bpc)] + mov [cq], eobd + or r3d, 8 + add r6d, 640 + sar r6d, 10 + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m2, r6d + paddsw m2, m3 +.dconly_loop: + paddsw m0, m2, [dstq+strideq*0] + paddsw m1, m2, [dstq+strideq*1] + psubusw m0, m3 + psubusw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m9, [o(pd_3857_4052)] + pmulld m23, m16 ; t16 t20 + pmulld m16, m7 ; t31 t27 + pmulld m22, m17 ; -t19 -t25 + pmulld m17, m9 ; t28 t24 + REPX {paddd x, m13}, m23, m16, m17 + psubd m22, m13, m22 + REPX {psrad x, 12 }, m23, m16, m22, m17 + mova m20, m23 ; t30 t26 + mova m9, m16 ; t17 t21 + mova m19, m22 ; t18 t22 + mova m18, m17 ; t29 t25 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m20, [o(pd_2751_2106)] + vbroadcasti32x4 m9, [o(pd_3035_3513)] + vbroadcasti32x4 m21, [o(pd_3703_3290)] + vbroadcasti32x4 m10, [o(pd_1751_2440)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m11, [o(pd_3857_4052)] + pmulld m23, m16 ; t16a t20a + pmulld m16, m7 ; t31a t27a + pmulld m20, m19 ; -t17a -t21a + pmulld m19, m9 ; t30a t26a + pmulld m21, m18 ; t18a t22a + pmulld m18, m10 ; t29a t25a + pmulld m22, m17 ; -t19a -t25a + pmulld m17, m11 ; t28a t24a + psubd m20, m13, m20 + psubd m22, m13, m22 + jmp .main2 +.main: + ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973 + ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106 + ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290 + ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601 + paddd m20, m13 + paddd m22, m13 +.main2: + REPX {paddd x, m13}, m16, m23, m19 + REPX {psrad x, 12 }, m16, m20, m23, m19 + psubd m9, m16, m20 ; t17 t21 + paddd m16, m20 ; t16 t20 + psubd 
m20, m23, m19 ; t30 t26 + paddd m23, m19 ; t31 t27 + REPX {pmaxsd x, m14}, m9, m16, m20, m23 + REPX {paddd x, m13}, m21, m18, m17 + REPX {psrad x, 12 }, m18, m22, m21, m17 + psubd m19, m22, m18 ; t18 t22 + paddd m22, m18 ; t19 t23 + psubd m18, m17, m21 ; t29 t25 + paddd m17, m21 ; t28 t24 + REPX {pmaxsd x, m14}, m19, m22, m18, m17 + REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17 +.main3: + vbroadcasti32x4 m11, [o(pd_4017_2276)] + vbroadcasti32x4 m10, [o(pd_799_3406)] + psubd m7, m0, m6 ; dct16 out15 out14 + paddd m0, m6 ; dct16 out0 out1 + psubd m6, m1, m5 ; dct16 out12 out13 + paddd m1, m5 ; dct16 out3 out2 + psubd m5, m2, m4 ; dct16 out11 out10 + paddd m2, m4 ; dct16 out4 out5 + psubd m4, m3, m8 ; dct16 out8 out9 + paddd m3, m8 ; dct16 out7 out6 + ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11 + ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3 + punpckhqdq m21, m16, m20 ; t20 t21a + punpcklqdq m16, m20 ; t16 t17a + punpcklqdq m20, m22, m19 ; t19 t18a + punpckhqdq m22, m19 ; t23 t22a + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + punpcklqdq m19, m23, m9 ; t31 t30a + punpckhqdq m23, m9 ; t27 t26a + punpckhqdq m9, m17, m18 ; t24 t25a + punpcklqdq m17, m18 ; t28 t29a + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + psubd m18, m16, m20 ; t19a t18 + paddd m20, m16 ; t16a t17 + psubd m16, m19, m17 ; t28a t29 + paddd m19, m17 ; t31a t30 + psubd m17, m22, m21 ; t20a t21 + paddd m22, m21 ; t23a t22 + psubd m21, m9, m23 ; t27a t26 + paddd m23, m9 ; t24a t25 + REPX {pmaxsd x, m14}, m18, m16, m17, m21 + REPX {pminsd x, m15}, m16, m18, m21, m17 + ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11 + ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m20, m22, m19, m23 + REPX {pminsd x, m15}, m20, m22, m19, m23 + paddd m9, m20, m22 ; t16 t17a + psubd m20, m22 ; t23 t22a + paddd m22, m19, m23 ; t31 t30a + psubd m19, m23 ; t24 t25a + psubd m23, m16, m17 ; t20a t21 + paddd m16, m17 ; t19a t18 + psubd m17, m18, m21 ; t27a t26 + paddd m21, m18 ; t28a t29 + REPX {pmaxsd x, m14}, m20, m19, m23, m17 + REPX {pminsd x, m15}, m19, m20, m17, m23 + REPX {pmulld x, m12}, m19, m20, m17, m23 + REPX {pmaxsd x, m14}, m22, m21, m16, m9 + paddd m19, m13 + paddd m17, m13 + REPX {pminsd x, m15}, m22, m21, m16, m9 + psubd m18, m19, m20 ; t23a t22 + paddd m19, m20 ; t24a t25 + paddd m20, m17, m23 ; t27 t26a + psubd m17, m23 ; t20 t21a + REPX {psrad x, 12 }, m20, m19, m18, m17 + ret + +cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob + vpbroadcastd m5, [pw_4096] + lea r4, [strideq*3] + mova m6, [idtx32x8p] + lea r5, [strideq*5] + vpbroadcastd m9, [pixel_10bpc_max] + lea r6, [strideq+r4*2] + pxor m8, m8 + sub eobd, 107 + psrlw m7, m6, 8 +.loop: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] ; 02 13 + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] ; 46 57 + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] ; 8a 9b + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] ; ce df + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m8}, 0, 1, 2, 3 + mova m4, m6 + vpermi2w m4, m1, m3 + vpermt2w m1, m7, m3 + REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 + mova m3, m7 + vpermi2w m3, m0, m2 + vpermt2w m0, m6, m2 + add cq, 64*8 + punpcklqdq m2, m3, m1 ; 4 5 + punpckhqdq m3, m1 ; 6 7 + punpckhqdq m1, m0, m4 ; 2 3 + punpcklqdq m0, m4 ; 0 1 + mova ym4, [dstq+strideq*0] + vinserti32x8 m4, [dstq+strideq*1], 1 + paddw m0, m4 + mova ym4, [dstq+strideq*2] + vinserti32x8 m4, [dstq+r4 *1], 1 + paddw m1, m4 + mova ym4, [dstq+strideq*4] + 
vinserti32x8 m4, [dstq+r5 *1], 1 + paddw m2, m4 + mova ym4, [dstq+r4 *2] + vinserti32x8 m4, [dstq+r6 *1], 1 + paddw m3, m4 + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+r4 *1], m1, 1 + mova [dstq+strideq*4], ym2 + vextracti32x8 [dstq+r5 *1], m2, 1 + mova [dstq+r4 *2], ym3 + vextracti32x8 [dstq+r6 *1], m3, 1 + add dstq, 32 + add eobd, 0x80000000 + jnc .loop + RET + +%endif ; ARCH_X86_64 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm index 4fb30ef4e7a..3833e17c99f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm @@ -361,18 +361,32 @@ ALIGN function_align %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 0, 4x4 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 - movd m1, [o(pw_2896x8)] + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 + mov r3d, 4 +.dconly: + add r5d, 128 + sar r5d, 8 +.dconly2: + imul r5d, 2896 + mova m2, [o(pixel_10bpc_max)] + add r5d, 34816 movd m0, r5d - packssdw m0, m0 - pmulhrsw m0, m1 - pshuflw m0, m0, q0000 + pshuflw m0, m0, q1111 + pxor m3, m3 punpcklqdq m0, m0 - mova m1, m0 - TAIL_CALL m(iadst_4x4_internal_16bpc).end +.dconly_loop: + movq m1, [dstq+strideq*0] + movhps m1, [dstq+strideq*1] + paddw m1, m0 + pminsw m1, m2 + pmaxsw m1, m3 + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET %endif %endmacro @@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 4x8 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 2 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 2048 - sar r5d, 12 -.end: - imul r5d, 2896 - add r5d, 34816 - movd m0, r5d - pshuflw m0, m0, q1111 - punpcklqdq m0, m0 - pxor m4, m4 - mova m3, [o(pixel_10bpc_max)] - lea r2, [strideq*3] -.loop: - movq m1, [dstq+strideq*0] - movq m2, [dstq+strideq*2] - movhps m1, [dstq+strideq*1] - movhps m2, [dstq+r2] - paddw m1, m0 - paddw m2, m0 - REPX {pminsw x, m3}, m1, m2 - REPX {pmaxsw x, m4}, m1, m2 - movq [dstq+strideq*0], m1 - movhps [dstq+strideq*1], m1 - movq [dstq+strideq*2], m2 - movhps [dstq+r2 ], m2 - lea dstq, [dstq+strideq*4] - dec r3d - jg .loop - RET + mov r3d, 8 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly %endif %endmacro @@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 4 - add r5d, 6144 - sar r5d, 13 - jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end + mov r3d, 16 + add r5d, 384 + sar r5d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 %endif %endmacro @@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 2048 - sar r5d, 12 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 128 + sar r5d, 8 imul r5d, 2896 add r5d, 
34816 movd m0, r5d @@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 2 .end: - add r5d, 6144 - sar r5d, 13 + add r5d, 384 + sar r5d, 9 .end2: imul r5d, 2896 add r5d, 34816 @@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 mov r3d, 4 %if stack_size_padded > 0 ; adjust to caller's stack allocation @@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 4 .dconly: - add r5d, 6144 - sar r5d, 13 + add r5d, 384 + sar r5d, 9 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ret .round: %if ARCH_X86_64 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 pcmpeqd m8, m8 REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] @@ -2785,6 +2774,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 + pmaxsd m0, [r3+ 0*16] + mova [r3+ 0*16], m7 + mova m7, [o(clip_18b_max)] + REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsd m7, [r3+ 0*16] + mova [r3+ 0*16], m0 pcmpeqd m0, m0 REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 mova [r3+ 1*16], m1 @@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 %if ARCH_X86_32 add rsp, 1*16 %endif @@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (5+ARCH_X86_64*3+WIN64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 %endif @@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ret .round: %if ARCH_X86_64 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 psrld m8, m11, 10 ; 2 REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] @@ -4087,6 +4086,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 + pmaxsd m0, [r3+ 0*16] + mova [r3+ 0*16], m7 + mova m7, [o(clip_18b_max)] + REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsd m7, [r3+ 0*16] + mova [r3+ 0*16], m0 mova m0, [o(pd_2)] REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7 paddd m0, [r3+ 0*16] @@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 
15, 0-36*16, \ call m(idct_8x8_internal_16bpc).round1_and_write_8x8 ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (31+2*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2 @@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \ %endif RET .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 add rsp, (65+4*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly @@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ ; final sumsub for idct16 as well as idct32, plus final downshift %macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx mova m%4, [r3+16*(23-%1)] + pmaxsd m%1, m12 + pminsd m%1, m13 psubd m%3, m%1, m%4 ; idct16 out15 - n paddd m%1, m%4 ; idct16 out0 + n pmaxsd m%1, m12 @@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ .loop_dct32_end: mova m0, [r3+16*16] mova m6, [r3+16*24] + pmaxsd m0, m2 + pminsd m0, m3 psubd m5, m0, m6 ; idct16 out15 - n paddd m0, m6 ; idct16 out0 + n pmaxsd m0, m2 @@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ %endif .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 .dconly1: - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ %endif .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ @@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add rsp, (5*32+1-(24+8*ARCH_X86_32))*16 @@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 @@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 @@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ mova m5, [r3-16* 4] ; idct64 48 + n mova m6, [r4-16*20] ; idct64 47 - n mova m7, [r3-16*20] ; idct64 32 + n + pmaxsd m0, m12 + pminsd m0, m13 paddd m8, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m12}, m8, m0 @@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 
16, 0-(64+8*ARCH_X86_32)*16, \ mova [r4-16* 4], m6 mova [r3+16*12], m8 %else + mova m5, [o(clip_18b_min)] + mova m6, [o(clip_18b_max)] mova m1, [r3+16*44] ; idct16 15 - n + pmaxsd m0, m5 + pminsd m0, m6 paddd m4, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n - mova m5, [o(clip_18b_min)] - mova m6, [o(clip_18b_max)] REPX {pmaxsd x, m5}, m4, m0 REPX {pminsd x, m6}, m4, m0 paddd m1, m4, m3 ; idct32 out0 + n @@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 .dconly1: - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 add rsp, (1+8*32+1*WIN64)*16 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 @@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm index 092c842786d..a67f053a61b 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm @@ -126,7 +126,7 @@ pw_m2751_3035x8: dw -2751*8, 3035*8 SECTION .text -; Code size reduction trickery: Intead of using rip-relative loads with +; Code size reduction trickery: Instead of using rip-relative loads with ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a ; single rip-relative lea and then address things relative from that with ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. 
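As a rough illustration of the base-pointer scheme described in the comment above (a sketch only, not part of the patch; the label and register names here are assumptions, not the definitions these files actually use), the pattern amounts to:

    %define o_base rodata_anchor           ; hypothetical label inside the data section
    %define o(x)   (r6-(o_base)+(x))       ; every table reference is addressed off r6
        lea          r6, [o_base]          ; one rip-relative lea per function
        vpbroadcastd m10, [o(pd_2048)]     ; disp8 encoding while pd_2048 stays within +-128 bytes of o_base

Constants that sit farther than +-128 bytes from the anchor still assemble, they merely fall back to 4-byte displacements, so keeping the hot tables close to the anchor is what makes the trick pay off.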
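Separately, the DC-only hunks in the itx16_sse.asm diff above replace the 2896-based rounding with a 181-based one; this is a bit-exact substitution, not an approximation (an editorial arithmetic check, not part of the patch):

    ; 2896 = 16*181 and 2048 = 16*128, so for any integer coefficient c:
    ;   (c*2896 + 2048) >> 12  ==  (16*(c*181 + 128)) >> 12  ==  (c*181 + 128) >> 8
    ; i.e. the new imul/add/sar sequence yields the same value with smaller constants and shifts.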
@@ -1194,13 +1194,9 @@ cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] + mov [cq], eobd pmulhrsw xm0, xm1 - movd xm2, [o(pw_2048)] - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mova m1, m0 - jmp m(iadst_8x4_internal_8bpc).end3 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 %endif %endmacro @@ -1340,20 +1336,20 @@ cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd + or r3d, 8 +.dconly: pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 +.dconly2: + movd xm2, [pw_2048] pmulhrsw xm0, xm1 + lea r2, [strideq*3] pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 -.end: - mov r2d, 2 -.end2: - lea r3, [strideq*3] -.loop: - WRITE_8X4 0, 0, 1, 2 +.dconly_loop: + WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 lea dstq, [dstq+strideq*4] - dec r2d - jg .loop + sub r3d, 4 + jg .dconly_loop RET %endif %endmacro @@ -1543,13 +1539,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 + or r3d, 16 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly %endif %endmacro @@ -1902,7 +1893,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd - mov r2d, 2 + or r3d, 4 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative @@ -1911,17 +1902,17 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 vpbroadcastw m0, xm0 pxor m3, m3 .dconly_loop: - mova xm1, [dstq] - vinserti128 m1, [dstq+strideq], 1 + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 paddw m1, m0 packuswb m1, m2 - mova [dstq], xm1 - vextracti128 [dstq+strideq], m1, 1 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] - dec r2d + sub r3d, 2 jg .dconly_loop RET %endif @@ -2162,7 +2153,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 4 + or r3d, 8 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -2473,7 +2464,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 8 + or r3d, 16 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -3120,13 +3111,8 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - pmulhrsw xm0, xm2 - psrlw xm2, 2 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly .full: REPX {pmulhrsw x, m9}, m12, m13, m14, m15 pmulhrsw m6, m9, [rsp+32*2] @@ -3290,7 +3276,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 8 + or r3d, 8 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative @@ -3307,7 +3293,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob packuswb m1, m2 mova [dstq], m1 add dstq, strideq - dec r2d + dec r3d jg .dconly_loop 
RET .normal: @@ -3672,7 +3658,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 16 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .full: mova [tmp1q-32*4], m1 @@ -3991,7 +3977,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 16 + or r3d, 16 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 @@ -4222,7 +4208,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ @@ -4486,7 +4472,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 32 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .normal: PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 @@ -4832,7 +4818,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 16 + or r3d, 16 .dconly: pmulhrsw xm0, xm2 movd xm2, [o(pw_2048)] @@ -4856,7 +4842,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob mova [dstq+32*0], m2 mova [dstq+32*1], m3 add dstq, strideq - dec r2d + dec r3d jg .dconly_loop RET .normal: @@ -4997,7 +4983,7 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 @@ -5200,7 +5186,7 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ @@ -5381,7 +5367,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm index 7d01bccb4f5..f30f4909287 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm @@ -29,7 +29,8 @@ %if ARCH_X86_64 SECTION_RODATA 64 -int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 +const \ +int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 @@ -84,7 +85,7 @@ pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 -gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10 +gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 
14, 15, 6, 7 @@ -845,7 +846,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 punpcklwd m3, m5 ; dct8 in3 in5 punpckhwd m5, m2 ; dct16 in11 in5 punpcklwd m6, m2 ; dct4 in3 in1 -.main2: +cglobal_label .main2 vpbroadcastd m10, [o(pd_2048)] .main3: vpbroadcastq m13, [o(int_mshift)] @@ -1355,7 +1356,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m3, m3, q2031 jmp m(iadst_8x8_internal_8bpc).end2 ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -1422,7 +1423,7 @@ ALIGN function_align punpckhqdq m0, m4 ; out0 -out1 ret ALIGN function_align -.main_pass2: +cglobal_label .main_pass2 IADST8_1D_PACKED 2 ret @@ -1499,8 +1500,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly @@ -1608,7 +1609,54 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpscatterdq [r3+ym8]{k2}, m2 RET ALIGN function_align -.main: +cglobal_label .main_fast2 ; bottom three-quarters are zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + vpbroadcastd ym3, [o(pw_401_4076x8)] + vpbroadcastd ym5, [o(pw_799_4017x8)] + vpbroadcastd ym4, [o(pw_m1189_3920x8)] + pxor ym6, ym6 + punpckhwd ym2, ym0, ym0 + pmulhrsw ym2, ym3 ; t8a t15a + punpcklwd ym7, ym1, ym1 + pmulhrsw ym7, ym5 ; t4a t7a + punpckhwd ym1, ym1 + pmulhrsw ym4, ym1 ; t11a t12a + vpcmpub k7, ym13, ym10, 6 + punpcklwd ym9, ym6, ym0 + psubsw ym0, ym2, ym4 ; t11a t12a + paddsw ym8, ym2, ym4 ; t8a t15a + mova ym1, ym7 + jmp .main5 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + pxor ym6, ym6 + punpckhwd ym8, ym0, ym0 + punpckhwd ym4, ym3, ym3 + punpckhwd ym5, ym2, ym2 + punpcklwd ym7, ym1, ym1 + punpckhwd ym1, ym1 + punpcklwd ym3, ym3 + punpcklwd ym9, ym6, ym0 + punpcklwd ym6, ym2 + vpbroadcastd ym2, [o(pw_401_4076x8)] + vpbroadcastd ym0, [o(pw_m2598_3166x8)] + vpbroadcastd ym11, [o(pw_1931_3612x8)] + vpbroadcastd ym12, [o(pw_m1189_3920x8)] + pmulhrsw ym8, ym2 ; t8a t15a + vpbroadcastd ym2, [o(pw_799_4017x8)] + pmulhrsw ym0, ym4 ; t9a t14a + vpbroadcastd ym4, [o(pw_m2276_3406x8)] + pmulhrsw ym5, ym11 ; t10a t13a + pmulhrsw ym1, ym12 ; t11a t12a + pmulhrsw ym7, ym2 ; t4a t7a + pmulhrsw ym3, ym4 ; t5a t6a + vpcmpub k7, ym13, ym10, 6 + jmp .main4 +ALIGN function_align +cglobal_label .main WRAP_YMM IDCT16_1D_PACKED ret @@ -1685,13 +1733,14 @@ ALIGN function_align vpermi2q m6, m0, m2 ; in4 in8 in6 in10 vpermt2q m1, m10, m3 ; in11 in7 in9 in5 .main: - vpbroadcastd m9, [o(pd_2048)] - vpbroadcastq m13, [o(int_mshift)] - kxnorb k1, k1, k1 punpcklwd m0, m4, m5 ; in0 in15 in2 in13 punpckhwd m4, m5 ; in12 in3 in14 in1 punpcklwd m5, m6, m1 ; in4 in11 in6 in9 punpckhwd m6, m1 ; in8 in7 in10 in5 +cglobal_label .main2 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + kxnorb k1, k1, k1 vpcmpub k7, m13, m9, 6 ; 0x33... 
pxor m8, m8 ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 @@ -1976,7 +2025,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly: imul r6d, 181 add r6d, 128 @@ -2114,7 +2163,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vextracti32x4 [r3 +r4 ], m1, 3 RET ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -2168,6 +2217,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call .main_pass2 + movshdup m4, [o(permC)] pmulhrsw m0, m6 pmulhrsw m1, m6 psrlq m6, m4, 4 @@ -2194,9 +2244,8 @@ ALIGN function_align IADST8_1D_PACKED 1 ret ALIGN function_align -.main_pass2: +cglobal_label .main_pass2 IADST8_1D_PACKED 2 - movshdup m4, [o(permC)] pxor m5, m5 psubd m5, m6 packssdw m6, m5 @@ -2222,6 +2271,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m4, [o(permC)] pmulhrsw m5, m6, m0 pmulhrsw m0, m6, m1 psrlq m1, m4, 12 @@ -2276,8 +2326,8 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 @@ -2456,7 +2506,7 @@ ALIGN function_align pmulhrsw m3, m4 ; t5a t6a jmp .main4 ALIGN function_align -.main: +cglobal_label .main IDCT16_1D_PACKED ret @@ -2562,6 +2612,7 @@ ALIGN function_align vshufi32x4 m1, m5, q2020 ; 2 3 vshufi32x4 m5, m7, m9, q2020 ; 10 11 vshufi32x4 m7, m9, q3131 ; 14 15 +cglobal_label .main_pass2b REPX {pshufd x, x, q1032}, m1, m3, m5, m7 call .main vpbroadcastd m8, [o(pw_2896x8)] @@ -2770,13 +2821,13 @@ ALIGN function_align vpermt2q m9, m12, m7 jmp m(idct_16x16_internal_8bpc).end -%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] - vpbroadcastd m%3, [o(pw_%4_%5x8)] - punpcklwd m%1, m%2, m%2 - pmulhrsw m%1, m%3 - vpbroadcastd m%3, [o(pw_%6_%7x8)] - punpckhwd m%2, m%2 - pmulhrsw m%2, m%3 +%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] + vpbroadcastd m%4, [o(pw_%5_%6x8)] + punpcklwd m%1, m%3, m%3 + pmulhrsw m%1, m%4 + vpbroadcastd m%4, [o(pw_%7_%8x8)] + punpckhwd m%2, m%3, m%3 + pmulhrsw m%2, m%4 %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob @@ -2864,82 +2915,86 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 - pxor ym4, ym4 vpermt2q m2, m5, m6 ; 8 10 vpermt2q m16, m5, m17 ; 9 11 - mova ym5, ym4 - mova ym6, ym4 - mova ym7, ym4 vextracti32x8 ym3, m2, 1 ; 12 14 vextracti32x8 ym17, m16, 1 ; 13 15 - call m(idct_8x16_internal_8bpc).main + call m(idct_8x16_internal_8bpc).main_fast call .main_fast .end: - vpbroadcastd ym12, strided - vpbroadcastd m13, [o(pw_2048)] - pmulld ym7, ym12, [o(gather8d)] - REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11 + vpbroadcastd ym8, strided + pmulld ym8, [o(gather8d)] + call .main_end lea r3, [dstq+strideq*4] - shl strideq, 4 - lea r4, [dstq+strideq] - add r1, r3 kxnorb k1, k1, k1 - pxor m6, m6 + lea r4, [dstq+strideq*8] + pxor m9, m9 + lea r1, [r3+strideq*8] kmovb k2, k1 - vpgatherdq m12{k1}, [r0+ym7] + vpgatherdq m12{k1}, [r0+ym8] kmovb k1, k2 - vpgatherdq m13{k2}, [r3+ym7] + vpgatherdq m13{k2}, 
[r3+ym8] kmovb k2, k1 - vpgatherdq m14{k1}, [r4+ym7] + vpgatherdq m14{k1}, [r4+ym8] kmovb k1, k2 - vpgatherdq m15{k2}, [r1+ym7] - REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - punpcklbw m4, m12, m6 - punpckhbw m12, m6 - paddw m0, m4 + vpgatherdq m15{k2}, [r1+ym8] + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m11, m12, m9 + punpckhbw m12, m9 + paddw m0, m11 paddw m1, m12 packuswb m0, m1 kmovb k2, k1 - vpscatterdq [r0+ym7]{k1}, m0 - punpcklbw m4, m13, m6 - punpckhbw m13, m6 - paddw m2, m4 + vpscatterdq [r0+ym8]{k1}, m0 + punpcklbw m12, m13, m9 + punpckhbw m13, m9 + paddw m2, m12 paddw m3, m13 packuswb m2, m3 kmovb k1, k2 - vpscatterdq [r3+ym7]{k2}, m2 - punpcklbw m4, m14, m6 - punpckhbw m14, m6 - paddw m8, m4 - paddw m9, m14 - packuswb m8, m9 + vpscatterdq [r3+ym8]{k2}, m2 + punpcklbw m13, m14, m9 + punpckhbw m14, m9 + paddw m4, m13 + paddw m5, m14 + packuswb m4, m5 kmovb k2, k1 - vpscatterdq [r4+ym7]{k1}, m8 - punpcklbw m4, m15, m6 - punpckhbw m15, m6 - paddw m10, m4 - paddw m11, m15 - packuswb m10, m11 - vpscatterdq [r1+ym7]{k2}, m10 + vpscatterdq [r4+ym8]{k1}, m4 + punpcklbw m14, m15, m9 + punpckhbw m15, m9 + paddw m6, m14 + paddw m7, m15 + packuswb m6, m7 + vpscatterdq [r1+ym8]{k2}, m6 RET .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 INIT_YMM avx512icl ALIGN function_align -.main_fast: ; bottom half is zero - ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a - ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a - ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a - ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a +cglobal_label .main_fast2 ; bottom three-quarters are zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + mova m11, m12 + mova m17, m20 + mova m15, m21 + mova m16, m14 + jmp .main4 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main3 ALIGN function_align -.main: +cglobal_label .main punpcklwd m12, m21, m14 ; in31 in1 punpckhwd m14, m21 ; in3 in29 punpcklwd m21, m20, m15 ; in27 in5 @@ -2966,6 +3021,7 @@ ALIGN function_align paddsw m21, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 +.main4: ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a @@ -2997,8 +3053,8 @@ ALIGN function_align REPX {pshufb x, m18}, m20, m11, m21, m19 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 - packssdw m18, m13 ; t23a t22 - packssdw m12, m15 ; t24a t25 + packssdw m18, m13 ; t23a t22 + packssdw m12, m15 ; t24a t25 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 packssdw m16, m13 ; t20 t21a @@ -3007,32 +3063,27 @@ ALIGN function_align punpckhqdq m19, m21 ; t28a t29 punpcklqdq m21, 
m20, m11 ; t16 t17a punpckhqdq m20, m11 ; t31 t30a - psubsw m15, m1, m19 ; out28 out29 - paddsw m1, m19 ; out3 out2 - psubsw m9, m6, m13 ; out19 out18 - paddsw m6, m13 ; out12 out13 - psubsw m10, m5, m16 ; out20 out21 - paddsw m5, m16 ; out11 out10 - psubsw m19, m3, m12 ; out24 out25 - paddsw m3, m12 ; out7 out6 - psubsw m8, m7, m21 ; out16 out17 - paddsw m7, m21 ; out15 out14 - psubsw m21, m0, m20 ; out31 out30 - paddsw m0, m20 ; out0 out1 - psubsw m11, m4, m18 ; out23 out22 - paddsw m4, m18 ; out8 out9 - psubsw m18, m2, m14 ; out27 out26 - paddsw m2, m14 ; out4 out5 INIT_ZMM avx512icl - movu m16, [o(permD+3)] - vpermt2q m0, m16, m4 ; 0 1 8 9 - vpermt2q m8, m16, m19 ; 16 17 24 25 - vpermt2q m1, m16, m5 ; 3 2 11 10 - vpermt2q m9, m16, m18 ; 19 18 27 26 - vpermt2q m2, m16, m6 ; 4 5 12 13 - vpermt2q m10, m16, m15 ; 20 21 28 29 - vpermt2q m3, m16, m7 ; 7 6 15 14 - vpermt2q m11, m16, m21 ; 23 22 31 30 + mova m15, [o(permA)] + ret +cglobal_label .main_end + vpbroadcastd m10, [o(pw_2048)] + vpermt2q m0, m15, m1 ; t0 t1 t2 t3 + vpermt2q m20, m15, m19 ; t31 t30a t29 t28a + vpermt2q m2, m15, m3 ; t4 t5 t6 t7 + vpermt2q m14, m15, m12 ; t27 t26a t25 t24a + vpermt2q m4, m15, m5 ; t8 t9 t10 t11 + vpermt2q m18, m15, m16 ; t23a t22 t21a t20 + vpermt2q m6, m15, m7 ; t12 t13 t14 t15 + vpermt2q m13, m15, m21 ; t19a t18 t17a t16 + psubsw m7, m0, m20 ; out31 out30 out29 out28 + paddsw m0, m20 ; out0 out1 out2 out3 + psubsw m5, m2, m14 ; out27 out26 out25 out24 + paddsw m2, m14 ; out4 out5 out6 out7 + psubsw m3, m4, m18 ; out23 out22 out21 out20 + paddsw m4, m18 ; out8 out9 out10 out11 + psubsw m1, m6, m13 ; out19 out18 out17 out16 + paddsw m6, m13 ; out12 out13 out14 out15 vzeroupper ret @@ -3079,16 +3130,33 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: - vpbroadcastd m12, [o(pw_8192)] - vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31 - vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30 - vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29 - vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28 - vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27 - vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26 - vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15 - vshufi32x4 m0, m8, q2020 ; 0 8 16 24 - REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m10, [o(pw_8192)] + vpermt2q m0, m15, m4 ; t0 t1 t9 t8 + vpermt2q m20, m15, m18 ; t31 t30a t23a t22 + vpermt2q m3, m15, m7 ; t7 t6 t14 t15 + vpermt2q m12, m15, m21 ; t25 t24a t17a t16 + vpermt2q m2, m15, m6 ; t4 t5 t13 t12 + vpermt2q m14, m15, m13 ; t23a t22 t21a t20 + vpermt2q m1, m15, m5 ; t3 t2 t10 t11 + vpermt2q m19, m15, m16 ; t27 t26a t19a t18 + psubsw m8, m0, m20 ; out31 out30 out22 out23 + paddsw m0, m20 ; out0 out1 out9 out8 + paddsw m6, m3, m12 ; out7 out6 out14 out15 + psubsw m3, m12 ; out24 out25 out17 out16 + psubsw m5, m2, m14 ; out27 out26 out18 out19 + paddsw m4, m2, m14 ; out4 out5 out13 out12 + psubsw m7, m1, m19 ; out28 out29 out21 out20 + paddsw m2, m1, m19 ; out3 out2 out10 out11 + vzeroupper + vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 + vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 + vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 + vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 + vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 + vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 + vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 + vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call 
m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 call .main vpbroadcastd m8, [o(pw_2048)] @@ -3132,7 +3200,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly2: imul r6d, 181 add r6d, 128+512 @@ -3158,7 +3226,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob jg .dconly_loop RET ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m10, [o(pd_2048)] .main2: ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a @@ -3535,7 +3603,7 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly ALIGN function_align .main_oddhalf_fast2: ; bottom three-quarters are zero @@ -3821,8 +3889,8 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -4603,7 +4671,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 ALIGN function_align .main_oddhalf_fast2: ; bottom three-quarters are zero @@ -5068,8 +5136,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 @@ -5282,7 +5350,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob jnz .normal movsx r6d, word [cq] mov [cq], eobd - mov r3d, 16 + or r3d, 16 .dconly: imul r6d, 181 add r6d, 128+512 @@ -6012,8 +6080,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -6674,8 +6742,8 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 32 imul r6d, 181 - mov r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -7117,7 +7185,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly ALIGN function_align .pass2_end: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h new file mode 100644 index 00000000000..33c842a9ce4 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h @@ -0,0 +1,66 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/loopfilter.h" + +#define decl_loopfilter_sb_fns(ext) \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext)) + +decl_loopfilter_sb_fns(ssse3); +decl_loopfilter_sb_fns(avx2); +decl_loopfilter_sb_fns(avx512icl); + +static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm index 361ccc3b883..ed83000ac24 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm @@ -30,22 +30,24 @@ SECTION_RODATA 32 +pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 times 4 db 8, 9 times 4 db 0, 1 times 4 db 8, 9 -pw_1: times 16 dw 1 -pw_2: times 16 dw 2 -pw_3: times 16 dw 3 -; 4 and 16 need to be next to each other since they are used as alternates -; depending on whether bitdepth is 10 or 12 -pw_4: times 16 dw 4 -pw_16: times 16 dw 16 -pw_8: times 16 dw 8 -pw_4096: times 16 dw 4096 +pw_1: times 16 dw 1 +pw_2: times 16 dw 2 +pw_3: times 16 dw 3 +pw_4096: times 2 dw 4096 -pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 +; 10bpc/12bpc: +pw_4: times 2 dw 4 + times 2 dw 16 +clip_max: times 2 dw 511 + times 2 dw 2047 +clip_min: times 2 dw -512 + times 2 dw -2048 SECTION .text @@ -398,9 +400,10 @@ SECTION .text pmaxuw m2, [pw_1] ; I psrlw m1, m0, 4 ; H paddw m0, [pw_2] + vpbroadcastd m8, [r11] paddw m0, m0 paddw m0, m2 ; E - REPX {pmullw x, [r11]}, m0, m1, m2 + REPX {pmullw x, m8}, m0, m1, m2 psubw m8, m3, m4 ; p1-p0 psubw m9, m5, m6 ; q1-q0 @@ -430,7 +433,8 @@ SECTION .text pabsw m10, m10 pmaxuw m9, m10 %endif - pcmpgtw m9, [r11] ; !flat8in + vpbroadcastd m10, [r11] + pcmpgtw m9, m10 ; !flat8in psubw m10, m13, m3 ; p2-p1 pabsw m10, m10 @@ -503,7 +507,8 @@ SECTION .text pmaxuw m0, m2 
pmaxuw m1, m10 pmaxuw m1, m0 - pcmpgtw m1, [r11] ; !flat8out + vpbroadcastd m0, [r11] + pcmpgtw m1, m0 ; !flat8out por m1, m9 ; !flat8in | !flat8out vpbroadcastd m2, [maskq+8] pand m10, m2, m12 @@ -544,12 +549,8 @@ SECTION .text %endif ; short filter - - vpbroadcastw m0, r7m - pcmpeqw m2, m2 - psrlw m0, 1 ; 511 or 2047 - pxor m2, m0 ; -512 or -2048 - + vpbroadcastd m0, [r11+8*1] ; 511 or 2047 + vpbroadcastd m2, [r11+8*2] ; -512 or -2048 psubw m10, m5, m4 paddw m11, m10, m10 paddw m11, m10 @@ -561,17 +562,18 @@ SECTION .text pminsw m10, m0 pmaxsw m10, m2 pand m8, m10 ; f&=fm - paddw m10, m8, [pw_3] - paddw m8, [pw_4] + vpbroadcastd m10, [pw_4] + paddw m10, m8 + paddw m8, [pw_3] REPX {pminsw x, m0}, m10, m8 psraw m10, 3 ; f2 psraw m8, 3 ; f1 - paddw m4, m10 - psubw m5, m8 + psubw m5, m10 + paddw m4, m8 - paddw m8, [pw_1] - psraw m8, 1 ; f=(f1+1)>>1 - pandn m8, m7, m8 ; f&=!hev + paddw m10, [pw_1] + psraw m10, 1 ; f=(f1+1)>>1 + pandn m8, m7, m10 ; f&=!hev paddw m3, m8 psubw m6, m8 pxor m8, m8 @@ -603,8 +605,8 @@ SECTION .text mova [rsp+ 0*32], m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 - psllw m8, m0, 3 ; p6*8 - paddw m8, [pw_8] + paddw m8, m0, [pw_1] + psllw m8, 3 ; p6*8+8 paddw m10, m2, m7 ; p5+p4 psubw m8, m0 paddw m10, m10 ; (p5+p4)*2 @@ -759,7 +761,6 @@ SECTION .text psubw m8, m15 paddw m8, m0 psrlw m10, m8, 4 - pand m10, m1 %ifidn %2, v mova m9, [tmpq+strideq*1] %else @@ -788,6 +789,7 @@ SECTION .text %if %1 >= 8 ; flat8 filter + vpbroadcastd m7, [pw_4096] %ifidn %2, v mova m0, [tmpq+strideq*0] ; p3 %else @@ -799,43 +801,43 @@ SECTION .text paddw m2, m0 ; p1+p0+p3 paddw m8, m5 ; 2*(p3+p2)+q0 paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 - pmulhrsw m7, m2, [pw_4096] + pmulhrsw m10, m2, m7 paddw m8, m3, m6 psubw m2, m1 paddw m2, m8 - pmulhrsw m8, m2, [pw_4096] + pmulhrsw m8, m2, m7 - paddw m10, m0, m3 - paddw m11, m4, m14 - psubw m2, m10 - paddw m2, m11 - pmulhrsw m10, m2, [pw_4096] + paddw m11, m0, m3 + paddw m1, m4, m14 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m1, m2, m7 paddw m11, m0, m4 + pblendvb m4, m1, m9 paddw m1, m5, m15 psubw m2, m11 paddw m2, m1 - pmulhrsw m11, m2, [pw_4096] + pmulhrsw m11, m2, m7 paddw m2, m6 paddw m2, m15 paddw m1, m13, m5 + pblendvb m5, m11, m9 + pblendvb m13, m10, m9 psubw m2, m1 - pmulhrsw m1, m2, [pw_4096] + pmulhrsw m1, m2, m7 psubw m2, m3 + pblendvb m3, m8, m9 psubw m2, m6 - paddw m0, m15, m14 - paddw m2, m0 - pmulhrsw m2, [pw_4096] + pblendvb m6, m1, m9 + paddw m1, m15, m14 + paddw m2, m1 + pmulhrsw m2, m7 - vpblendvb m13, m13, m7, m9 - vpblendvb m3, m3, m8, m9 - vpblendvb m4, m4, m10, m9 - vpblendvb m5, m5, m11, m9 - vpblendvb m6, m6, m1, m9 - vpblendvb m14, m14, m2, m9 + pblendvb m14, m2, m9 %ifidn %2, v mova [tmpq+strideq*1], m13 ; p2 @@ -844,9 +846,7 @@ SECTION .text mova [dstq+strideq*0], m5 ; q0 mova [dstq+strideq*1], m6 ; q1 mova [dstq+strideq*2], m14 ; q2 -%else - mova m0, [rsp+5*32] -%if %1 == 8 +%elif %1 == 8 TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 ; write 8x16 @@ -871,29 +871,28 @@ SECTION .text vextracti128 [dstq+stride3q -8], m15, 1 lea dstq, [dstq+strideq*4] %else - mova m0, [rsp+6*32] + mova m8, [rsp+6*32] mova m1, [rsp+7*32] mova m2, [rsp+8*32] mova m7, [rsp+9*32] - mova m8, [rsp+5*32] - TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9 + TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9 - mova [dstq+strideq*0-16], xm0 + mova [dstq+strideq*0-16], xm8 mova [dstq+strideq*1-16], xm1 mova [dstq+strideq*2-16], xm2 mova [dstq+stride3q -16], xm7 lea tmpq, [dstq+strideq*4] - mova [tmpq+strideq*0-16], xm8 + mova [tmpq+strideq*0-16], xm0 mova [tmpq+strideq*1-16], xm13 mova 
[tmpq+strideq*2-16], xm3 mova [tmpq+stride3q -16], xm4 lea tmpq, [tmpq+strideq*4] - vextracti128 [tmpq+strideq*0-16], m0, 1 + vextracti128 [tmpq+strideq*0-16], m8, 1 vextracti128 [tmpq+strideq*1-16], m1, 1 vextracti128 [tmpq+strideq*2-16], m2, 1 vextracti128 [tmpq+stride3q -16], m7, 1 lea tmpq, [tmpq+strideq*4] - vextracti128 [tmpq+strideq*0-16], m8, 1 + vextracti128 [tmpq+strideq*0-16], m0, 1 vextracti128 [tmpq+strideq*1-16], m13, 1 vextracti128 [tmpq+strideq*2-16], m3, 1 vextracti128 [tmpq+stride3q -16], m4, 1 @@ -924,39 +923,38 @@ SECTION .text vextracti128 [dstq+stride3q ], m3, 1 lea dstq, [dstq+strideq*4] %endif -%endif %elif %1 == 6 ; flat6 filter - + vpbroadcastd m7, [pw_4096] paddw m8, m3, m4 paddw m8, m13 ; p2+p1+p0 paddw m11, m13, m5 paddw m8, m8 paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 - pmulhrsw m2, m8, [pw_4096] + pmulhrsw m2, m8, m7 paddw m8, m5 paddw m11, m13, m13 paddw m8, m6 psubw m8, m11 - pmulhrsw m10, m8, [pw_4096] + pmulhrsw m10, m8, m7 paddw m8, m6 paddw m11, m13, m3 paddw m8, m14 psubw m8, m11 - pmulhrsw m11, m8, [pw_4096] + pmulhrsw m11, m8, m7 psubw m8, m3 paddw m14, m14 psubw m8, m4 paddw m8, m14 - pmulhrsw m8, [pw_4096] + pmulhrsw m8, m7 - vpblendvb m3, m3, m2, m9 - vpblendvb m4, m4, m10, m9 - vpblendvb m5, m5, m11, m9 - vpblendvb m6, m6, m8, m9 + pblendvb m3, m2, m9 + pblendvb m4, m10, m9 + pblendvb m5, m11, m9 + pblendvb m6, m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m3 ; p1 @@ -982,10 +980,10 @@ INIT_YMM avx2 cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq @@ -1013,7 +1011,7 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ test [maskq+0], mask_bitsd ; vmask[0] jz .end - FILTER 4, v + call .v4 .end: pslld m12, 4 @@ -1023,15 +1021,19 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ sub wd, 4 jg .loop RET +ALIGN function_align +.v4: + FILTER 4, v + ret INIT_YMM avx2 cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 @@ -1058,7 +1060,7 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter - FILTER 4, h + call .h4 jmp .end .no_filter: @@ -1071,15 +1073,19 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ sub hd, 4 jg .loop RET +ALIGN function_align +.h4: + FILTER 4, h + ret INIT_YMM avx2 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq @@ -1100,7 +1106,7 @@ cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ test [maskq+0], mask_bitsd ; vmask[0] jz .end - FILTER 4, v + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4 .end: pslld m12, 4 @@ -1115,10 +1121,10 @@ INIT_YMM avx2 cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, 
[r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 @@ -1138,7 +1144,7 @@ cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter - FILTER 4, h + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4 jmp .end .no_filter: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm new file mode 100644 index 00000000000..b7bc3aa106f --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm @@ -0,0 +1,912 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +l_shuf_v: times 2 db 0, 32 +pw_1: times 2 dw 1 + times 2 db 4, 36 +pw_3: times 2 dw 3 + times 2 db 8, 40 +pw_4: times 2 dw 4 + times 2 db 12, 44 +pw_16: times 2 dw 16 + times 2 db 16, 48 +pw_4096: times 2 dw 4096 + times 2 db 20, 52 +pw_16384: times 2 dw 16384 + times 2 db 24, 56 +pw_32767: times 2 dw 32767 + times 2 db 28, 60 + times 2 dw 0 +filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128 +stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25 +l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1 +clip_max: dw 511, 511, 2047, 2047 +clip_min: dw -512, -512, -2048, -2048 + +SECTION .text + +%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp + punpckhwd m%9, m%5, m%6 + punpcklwd m%5, m%6 + punpckhwd m%6, m%1, m%2 + punpcklwd m%1, m%2 + punpckhwd m%2, m%7, m%8 + punpcklwd m%7, m%8 + punpckhwd m%8, m%3, m%4 + punpcklwd m%3, m%4 + punpckhdq m%4, m%1, m%3 + punpckldq m%1, m%3 + punpckldq m%3, m%5, m%7 + punpckhdq m%5, m%7 + punpckhdq m%7, m%6, m%8 + punpckldq m%6, m%8 + punpckldq m%8, m%9, m%2 + punpckhdq m%9, m%2 + punpckhqdq m%2, m%1, m%3 + punpcklqdq m%1, m%3 + punpcklqdq m%3, m%4, m%5 + punpckhqdq m%4, m%5 + punpcklqdq m%5, m%6, m%8 + punpckhqdq m%6, m%8 + punpckhqdq m%8, m%7, m%9 + punpcklqdq m%7, m%9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] +%ifidn %2, v +%if %1 == 16 + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1 ] + mova m1, [tmpq+strideq*2 ] ; p5 + mova m2, [tmpq+stride3q ] ; p4 + mova m3, [tmpq+strideq*4 ] ; p3 + mova m4, [tmpq+stride5q ] ; p2 +%elif %1 == 6 || %1 == 8 + lea tmpq, [dstq+mstrideq*4] +%if %1 == 8 + mova m3, [tmpq+strideq*0 ] +%endif + mova m4, [tmpq+strideq*1 ] +%endif + mova m5, [dstq+mstrideq*2] ; p1 + mova m6, [dstq+mstrideq*1] ; p0 + mova m7, [dstq+strideq*0 ] ; q0 + mova m8, [dstq+strideq*1 ] ; q1 +%if %1 != 4 + mova m9, [dstq+strideq*2 ] ; q2 +%endif +%if %1 == 8 || %1 == 16 + mova m10, [dstq+stride3q ] ; q3 +%endif +%if %1 == 16 + mova m11, [dstq+strideq*4 ] ; q4 + mova m22, [dstq+stride5q ] ; q5 + mova m23, [dstq+stride3q*2] +%endif +%else ; h +%if %1 == 16 + movu ym16, [dstq+strideq*0 -16] + movu ym17, [dstq+strideq*1 -16] + movu ym18, [dstq+strideq*2 -16] + movu ym19, [dstq+stride3q -16] + movu ym20, [dstq+strideq*4 -16] + movu ym22, [dstq+stride5q -16] + movu ym23, [dstq+stride3q*2-16] + movu ym28, [dstq+stride7q -16] + lea tmpq, [dstq+strideq*8 -16] + vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m10, m19, [tmpq+stride3q ], 1 + vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1 + vinserti32x8 m22, m22, [tmpq+stride5q ], 1 + vinserti32x8 m23, m23, [tmpq+stride3q*2], 1 + vinserti32x8 m28, m28, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8] + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27 + movu ym16, [tmpq+strideq*0 ] + movu ym17, [tmpq+strideq*1 ] + movu ym18, [tmpq+strideq*2 ] + movu ym19, [tmpq+stride3q ] + movu ym24, [tmpq+strideq*4 ] + movu ym25, [tmpq+stride5q ] + movu ym26, [tmpq+stride3q*2] + movu ym20, [tmpq+stride7q ] + lea tmpq, [tmpq+strideq*8] + vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m3, m19, [tmpq+stride3q ], 1 + vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1 + vinserti32x8 m5, m25, [tmpq+stride5q ], 1 + vinserti32x8 m6, m26, [tmpq+stride3q*2], 1 + vinserti32x8 m20, m20, [tmpq+stride7q ], 1 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 
27 + vshufi32x4 m27, m7, m0, q2020 + vshufi32x4 m7, m0, q3131 + vshufi32x4 m0, m8, m1, q2020 + vshufi32x4 m8, m1, q3131 + vshufi32x4 m1, m9, m2, q2020 + vshufi32x4 m9, m2, q3131 + vshufi32x4 m2, m10, m3, q2020 + vshufi32x4 m10, m3, q3131 + vshufi32x4 m3, m11, m4, q2020 + vshufi32x4 m11, m4, q3131 + vshufi32x4 m4, m22, m5, q2020 + vshufi32x4 m22, m5, q3131 + vshufi32x4 m5, m23, m6, q2020 + vshufi32x4 m23, m6, q3131 + vshufi32x4 m6, m28, m20, q2020 + vshufi32x4 m28, m20, q3131 +%elif %1 == 6 || %1 == 8 +%if %1 == 8 + sub dstq, 8 + movu xm16, [dstq+strideq*0 ] + movu xm17, [dstq+strideq*1 ] + movu xm18, [dstq+strideq*2 ] + movu xm19, [dstq+stride3q ] + movu xm24, [dstq+strideq*4 ] + movu xm25, [dstq+stride5q ] + movu xm26, [dstq+stride3q*2] + movu xm27, [dstq+stride7q ] + lea tmpq, [dstq+strideq*8 ] + vinserti128 ym16, [tmpq+strideq*0 ], 1 + vinserti128 ym17, [tmpq+strideq*1 ], 1 + vinserti128 ym18, [tmpq+strideq*2 ], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + vinserti128 ym24, [tmpq+strideq*4 ], 1 + vinserti128 ym25, [tmpq+stride5q ], 1 + vinserti128 ym26, [tmpq+stride3q*2], 1 + vinserti128 ym27, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2 + vinserti32x4 m9, m25, [tmpq+stride5q ], 2 + vinserti32x4 m3, m26, [tmpq+stride3q*2], 2 + vinserti32x4 m4, m27, [tmpq+stride7q ], 2 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, [tmpq+strideq*0 ], 3 + vinserti32x4 m8, [tmpq+strideq*1 ], 3 + vinserti32x4 m5, [tmpq+strideq*2 ], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + vinserti32x4 m2, [tmpq+strideq*4 ], 3 + vinserti32x4 m9, [tmpq+stride5q ], 3 + vinserti32x4 m3, [tmpq+stride3q*2], 3 + vinserti32x4 m4, [tmpq+stride7q ], 3 +%else ; %1 == 6 + movu xm16, [dstq+strideq*0-8] + movu xm17, [dstq+strideq*1-8] + movu xm18, [dstq+strideq*2-8] + movu xm19, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4-8] + movu xm2, [tmpq+strideq*0] + movu xm9, [tmpq+strideq*1] + movu xm3, [tmpq+strideq*2] + movu xm4, [tmpq+stride3q ] + lea tmpq, [tmpq+strideq*4] + vinserti128 ym16, [tmpq+strideq*0], 1 + vinserti128 ym17, [tmpq+strideq*1], 1 + vinserti128 ym18, [tmpq+strideq*2], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 ym2, [tmpq+strideq*0], 1 + vinserti128 ym9, [tmpq+strideq*1], 1 + vinserti128 ym3, [tmpq+strideq*2], 1 + vinserti128 ym4, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, m16, [tmpq+strideq*0], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 2 + vinserti32x4 m9, [tmpq+strideq*1], 2 + vinserti32x4 m3, [tmpq+strideq*2], 2 + vinserti32x4 m4, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, [tmpq+strideq*0], 3 + vinserti32x4 m8, [tmpq+strideq*1], 3 + vinserti32x4 m5, [tmpq+strideq*2], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 3 + vinserti32x4 m9, [tmpq+strideq*1], 3 + vinserti32x4 m3, [tmpq+strideq*2], 3 + vinserti32x4 m4, [tmpq+stride3q ], 3 +%endif + punpcklwd m6, m10, m8 + punpckhwd m10, m8 + punpcklwd m8, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m2, m9 + punpckhwd m2, m9 + punpcklwd m9, m3, m4 + punpckhwd m3, m4 + punpckldq m4, m6, m8 + punpckhdq m6, m8 + punpckldq m8, m10, m5 + punpckhdq 
m10, m5 + punpckldq m5, m7, m9 + punpckhdq m7, m9 + punpckldq m9, m2, m3 + punpckhdq m2, m3 +%if %1 == 8 + punpcklqdq m3, m4, m5 +%endif + punpckhqdq m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m9 + punpckhqdq m8, m9 + punpcklqdq m9, m10, m2 +%if %1 == 8 + punpckhqdq m10, m2 +%endif +%else ; %1 == 4 + kxnorb k1, k1, k1 + kmovb k2, k1 + vpgatherdq m7{k1}, [dstq+ym12-4] + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpgatherdq m4{k2}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpgatherdq m5{k1}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + vpgatherdq m6{k2}, [tmpq+ym12] + punpcklwd m8, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m8, m7 + punpckhwd m8, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m4 + punpckhqdq m8, m4 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + movu ym16, [lq+l_strideq*1] + movsldup m17, [l_shuf_v] + vptestnmb k1, ym16, ym16 + vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][] + vpermb m16, m17, m16 ; l[x][1] +%else + movq xm16, [lq+l_strideq*0] + movq xm17, [lq+l_strideq*1] + vinserti128 ym16, [lq+l_strideq*2], 1 + vinserti128 ym17, [lq+l_stride3q ], 1 + lea tmpq, [lq+l_strideq*4] + vinserti32x4 m16, [tmpq+l_strideq*0], 2 + vinserti32x4 m17, [tmpq+l_strideq*1], 2 + vinserti32x4 m16, [tmpq+l_strideq*2], 3 + vinserti32x4 m17, [tmpq+l_stride3q ], 3 + punpcklqdq m16, m17 + vbroadcasti32x4 m17, [l_shuf_h] + vptestnmb k1, m16, m16 + vpalignr m16{k1}, m16, 12 + pshufb m16, m17 ; l[x][1] +%endif + vpbroadcastd m20, [pw_32767] + psubw m17, m5, m6 ; p1-p0 + psubw m18, m7, m8 ; q1-q0 + vptestmw k1, m16, m16 ; L + pabsw m17, m17 + pabsw m18, m18 + vpmaxuw m20{k1}, m17, m18 + vpbroadcastw m17, [lutq+136] + psrlw m18, m16, [lutq+128] + vpbroadcastd m19, [pw_1] + pminuw m18, m17 + psrlw m17, m16, 4 ; H + paddw m16, m16 + pmaxuw m18, m19 ; I + vpaddd m16, [pw_4] {1to16} + paddw m16, m18 ; E + REPX {pmullw x, m13}, m17, m18, m16 + vpcmpw k4, m20, m17, 6 ; hev +%if %1 != 4 + psubw m19, m4, m5 ; p2-p1 + pabsw m19, m19 +%if %1 == 8 || %1 == 16 + psubw m17, m3, m4 ; p3-p2 + pabsw m17, m17 + pmaxuw m19, m17 + psubw m17, m9, m10 ; q3-q2 + pabsw m17, m17 + pmaxuw m19, m17 +%endif + psubw m17, m9, m8 ; q2-q1 + pabsw m17, m17 + pmaxuw m19, m17 +%if %1 == 16 + vpbroadcastd ym17, [maskq+4] + vpord ym17, [maskq+8] {1to8} + vptestmd k1, ym17, ym21 +%else + vptestmd k1, ym21, [maskq+4] {1to8} +%endif + pmaxuw m19, m20 + psubw m17, m4, m6 ; p2-p0 + pabsw m17, m17 + pmaxuw m17, m20 + vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks +%if %1 == 8 || %1 == 16 + psubw m19, m3, m6 ; p3-p0 + pabsw m19, m19 + pmaxuw m17, m19 + psubw m19, m7, m10 ; q3-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + psubw m19, m7, m9 ; q2-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + vpcmpw k1, m20, m18, 2 + psubw m18, m5, m8 ; p1-q1 + psubw m19, m6, m7 ; p0-q0 + pabsw m18, m18 + pabsw m19, m19 + psrlw m18, 1 + paddw m19, m19 + paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E +%if %1 != 4 + vpcmpw k2{k1}, m17, m13, 2 ; flat8in +%endif +%if %1 == 16 + psubw m20, m0, m6 + psubw m16, m1, m6 + pabsw m20, m20 + psubw m17, m2, m6 + pabsw m16, m16 + psubw m18, m11, m7 + pabsw m17, m17 + psubw m19, m22, m7 + pabsw m18, m18 + pmaxuw m20, m16 + psubw m16, m23, m7 + pabsw m19, m19 + pmaxuw m17, m18 + pabsw m16, m16 + vpandd ym18, ym21, [maskq+8] {1to8} + pmaxuw m20, m17 + pmaxuw m19, m16 + pcmpeqd 
ym16, ym21, ym18 + vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8 + pmaxuw m20, m19 + pcmpeqd ym17, ym21, ym18 + vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8 + vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out + pcmpeqd ym18, ym21 + vptestmb k3{k3}, ym16, ym16 ; flat8 & fm + vptestmb k2{k2}, ym17, ym17 ; flat8in + vptestmb k1{k1}, ym18, ym18 + kandnd k1, k2, k1 ; fm & !flat8 & !flat16 + kandnd k2, k3, k2 ; flat8 & !flat16 +%elif %1 == 6 || %1 == 8 + vpandd ym17, ym21, [maskq+4] {1to8} + pcmpeqd ym16, ym21, ym17 + vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8 + pcmpeqd ym17, ym21 + vptestmb k2{k2}, ym16, ym16 ; flat8 & fm + vptestmb k1{k1}, ym17, ym17 + kandnd k1, k2, k1 ; fm & !flat8 +%else ; %1 == 4 + vpandd ym16, ym21, [maskq+0] {1to8} + pcmpeqd ym16, ym21 + vptestmb k1{k1}, ym16, ym16 +%endif + + ; short filter + psubw m16, m7, m6 + vpbroadcastd m17, [pw_3] + paddw m18, m16, m16 + paddw m18, m16 + psubw m16, m5, m8 ; iclip_diff(p1-q1) + pminsw m16, m14 + vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev + knotd k4, k4 ; !hev + paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f) + vpbroadcastd m18, [pw_4] + pminsw m16, m14 + vpmaxsw m16{k1}{z}, m15 ; f&=fm + paddw m17, m16 + paddw m16, m18 + vpbroadcastd m18, [pw_16384] + pminsw m17, m14 + pminsw m16, m14 + psraw m17, 3 ; f2 + psraw m16, 3 ; f1 + paddw m6, m17 + psubw m7, m16 + vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev + psubw m17, m14, m15 ; 1023 or 4095 + pxor m18, m18 + paddw m5, m16 + psubw m8, m16 + REPX {pminsw x, m17}, m6, m7, m5, m8 + REPX {pmaxsw x, m18}, m6, m7, m5, m8 + +%if %1 == 16 ; flat16 filter + vpaddd m19, m0, [pw_1] {1to16} + paddw m16, m1, m2 ; p5+p4 + paddw m26, m1, m6 ; p5+p0 + paddw m24, m2, m7 ; p4+q0 + paddw m16, m4 ; p5+p4+p3 + paddw m17, m3, m5 ; p2+p1 + psllw m19, 3 + paddw m16, m26 ; p5*2+p4+p3+p0 + paddw m17, m24 ; p4+p2+p1+q0 + psubw m19, m0 ; p6*7+8 + paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0 + paddw m18, m3, m8 + paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0 + paddw m25, m1, m0 + paddw m16, m0, m0 + psrlw m1{k3}, m19, 4 + paddw m19, m18 + psubw m19, m16 ; +p3+q1-p6*2 + paddw m16, m2, m0 + psrlw m2{k3}, m19, 4 + psubw m19, m25 + paddw m25, m4, m9 + paddw m20, m10, m5 + paddw m19, m25 ; +p2+q2-p6-p5 + paddw m17, m0, m3 + psubw m16, m20, m16 + psrlw m3{k3}, m19, 4 + paddw m19, m16 ; +p1+q3-p6-p4 + paddw m16, m11, m6 + psubw m16, m17 + paddw m17, m0, m4 + psrlw m4{k3}, m19, 4 + paddw m19, m16 ; +p0+q4-p6-p3 + paddw m16, m22, m7 + psubw m16, m17 + paddw m17, m0, m5 + psrlw m5{k3}, m19, 4 + paddw m19, m16 ; +q0+q5-p6-p2 + paddw m16, m23, m8 + psrlw m6{k3}, m19, 4 + psubw m16, m17 + paddw m19, m16 ; +q1+q6-p6-p1 + paddw m16, m23, m9 + psrlw m7{k3}, m19, 4 + psubw m16, m26 + paddw m19, m16 ; +q2+q6-p5-p0 + paddw m16, m23, m10 + psrlw m8{k3}, m19, 4 + psubw m16, m24 + paddw m19, m16 ; +q3+q6-p4-p0 + paddw m16, m23, m11 + psrlw m9{k3}, m19, 4 + psubw m16, m18 + paddw m19, m16 ; +q4+q6-p3-q1 + paddw m16, m23, m22 + psrlw m10{k3}, m19, 4 + psubw m16, m25 + paddw m19, m16 ; +q5+q6-p2-q2 + paddw m16, m23, m23 + psrlw m11{k3}, m19, 4 + psubw m16, m20 + paddw m19, m16 ; +q6*2-p1-q3 + psrlw m22{k3}, m19, 4 +%endif +%if %1 == 8 || %1 == 16 ; flat8 filter + vpbroadcastd m20, [pw_4096] + paddw m16, m3, m4 ; p3+p2 + paddw m19, m5, m6 ; p1+p0 + paddw m17, m16, m16 ; 2*(p3+p2) + paddw m19, m3 ; p1+p0+p3 + paddw m17, m7 ; 2*(p3+p2)+q0 + paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0 + paddw m18, m4, m7 + pmulhrsw m4{k2}, m19, m20 + psubw m19, m16 + paddw m17, m5, m8 + paddw m16, m3, m5 + paddw m19, m17 + pmulhrsw m5{k2}, m19, m20 + psubw m19, m16 
+ paddw m16, m6, m9 + paddw m19, m16 + paddw m16, m3, m6 + pmulhrsw m6{k2}, m19, m20 + paddw m19, m10 + psubw m16, m7, m16 + paddw m19, m16 + psubw m16, m10, m18 + pmulhrsw m7{k2}, m19, m20 + paddw m16, m8 + paddw m19, m16 + psubw m16, m10, m17 + pmulhrsw m8{k2}, m19, m20 + paddw m16, m9 + paddw m19, m16 + pmulhrsw m9{k2}, m19, m20 +%elif %1 == 6 ; flat6 filter + vpbroadcastd m10, [pw_4096] + paddw m2, m5, m6 + paddw m0, m4, m7 + paddw m1, m2, m4 ; p2+p1+p0 + paddw m3, m4, m4 + paddw m1, m1 + paddw m4, m5 + paddw m1, m0 ; p2+2*(p2+p1+p0)+q0 + psubw m3, m7, m3 + pmulhrsw m5{k2}, m1, m10 + paddw m3, m8 + psubw m4, m8, m4 + paddw m1, m3 + pmulhrsw m6{k2}, m1, m10 + paddw m4, m9 + paddw m9, m9 + paddw m1, m4 + pmulhrsw m7{k2}, m1, m10 + psubw m9, m2 + paddw m1, m9 + pmulhrsw m8{k2}, m1, m10 +%endif + +%ifidn %2, v +%if %1 == 16 + mova [tmpq+strideq*2 ], m1 ; p5 + mova [tmpq+stride3q ], m2 ; p4 + mova [tmpq+strideq*4 ], m3 ; p3 + mova [tmpq+stride5q ], m4 ; p2 +%elif %1 == 8 + mova [tmpq+strideq*1 ], m4 ; p2 +%endif + mova [dstq+mstrideq*2], m5 ; p1 + mova [dstq+mstrideq ], m6 ; p0 + mova [dstq+strideq*0 ], m7 ; q0 + mova [dstq+strideq*1 ], m8 ; q1 +%if %1 == 8 || %1 == 16 + mova [dstq+strideq*2 ], m9 ; q2 +%endif +%if %1 == 16 + mova [dstq+stride3q ], m10 ; q3 + mova [dstq+strideq*4 ], m11 ; q4 + mova [dstq+stride5q ], m22 ; q5 +%endif +%else +%if %1 == 16 + TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20 + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20 + mova [dstq+strideq*0 -16], xm27 + mova [dstq+strideq*0 ], xm7 + mova [dstq+strideq*1 -16], xm0 + mova [dstq+strideq*1 ], xm8 + mova [dstq+strideq*2 -16], xm1 + mova [dstq+strideq*2 ], xm9 + mova [dstq+stride3q -16], xm2 + mova [dstq+stride3q ], xm10 + mova [dstq+strideq*4 -16], xm3 + mova [dstq+strideq*4 ], xm11 + mova [dstq+stride5q -16], xm4 + mova [dstq+stride5q ], xm22 + mova [dstq+stride3q*2-16], xm5 + mova [dstq+stride3q*2 ], xm23 + mova [dstq+stride7q -16], xm6 + mova [dstq+stride7q ], xm28 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 -16], ym27, 1 + vextracti128 [dstq+strideq*0 ], ym7, 1 + vextracti128 [dstq+strideq*1 -16], ym0, 1 + vextracti128 [dstq+strideq*1 ], ym8, 1 + vextracti128 [dstq+strideq*2 -16], ym1, 1 + vextracti128 [dstq+strideq*2 ], ym9, 1 + vextracti128 [dstq+stride3q -16], ym2, 1 + vextracti128 [dstq+stride3q ], ym10, 1 + vextracti128 [dstq+strideq*4 -16], ym3, 1 + vextracti128 [dstq+strideq*4 ], ym11, 1 + vextracti128 [dstq+stride5q -16], ym4, 1 + vextracti128 [dstq+stride5q ], ym22, 1 + vextracti128 [dstq+stride3q*2-16], ym5, 1 + vextracti128 [dstq+stride3q*2 ], ym23, 1 + vextracti128 [dstq+stride7q -16], ym6, 1 + vextracti128 [dstq+stride7q ], ym28, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 2 + vextracti32x4 [dstq+strideq*0 ], m7, 2 + vextracti32x4 [dstq+strideq*1 -16], m0, 2 + vextracti32x4 [dstq+strideq*1 ], m8, 2 + vextracti32x4 [dstq+strideq*2 -16], m1, 2 + vextracti32x4 [dstq+strideq*2 ], m9, 2 + vextracti32x4 [dstq+stride3q -16], m2, 2 + vextracti32x4 [dstq+stride3q ], m10, 2 + vextracti32x4 [dstq+strideq*4 -16], m3, 2 + vextracti32x4 [dstq+strideq*4 ], m11, 2 + vextracti32x4 [dstq+stride5q -16], m4, 2 + vextracti32x4 [dstq+stride5q ], m22, 2 + vextracti32x4 [dstq+stride3q*2-16], m5, 2 + vextracti32x4 [dstq+stride3q*2 ], m23, 2 + vextracti32x4 [dstq+stride7q -16], m6, 2 + vextracti32x4 [dstq+stride7q ], m28, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 3 + vextracti32x4 [dstq+strideq*0 ], m7, 3 + vextracti32x4 [dstq+strideq*1 -16], m0, 3 + 
vextracti32x4 [dstq+strideq*1 ], m8, 3 + vextracti32x4 [dstq+strideq*2 -16], m1, 3 + vextracti32x4 [dstq+strideq*2 ], m9, 3 + vextracti32x4 [dstq+stride3q -16], m2, 3 + vextracti32x4 [dstq+stride3q ], m10, 3 + vextracti32x4 [dstq+strideq*4 -16], m3, 3 + vextracti32x4 [dstq+strideq*4 ], m11, 3 + vextracti32x4 [dstq+stride5q -16], m4, 3 + vextracti32x4 [dstq+stride5q ], m22, 3 + vextracti32x4 [dstq+stride3q*2-16], m5, 3 + vextracti32x4 [dstq+stride3q*2 ], m23, 3 + vextracti32x4 [dstq+stride7q -16], m6, 3 + vextracti32x4 [dstq+stride7q ], m28, 3 +%elif %1 == 8 + TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2 + movu [dstq+strideq*0 ], xm3 + movu [dstq+strideq*1 ], xm4 + movu [dstq+strideq*2 ], xm5 + movu [dstq+stride3q ], xm6 + movu [dstq+strideq*4 ], xm7 + movu [dstq+stride5q ], xm8 + movu [dstq+stride3q*2], xm9 + movu [dstq+stride7q ], xm10 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 ], ym3, 1 + vextracti128 [dstq+strideq*1 ], ym4, 1 + vextracti128 [dstq+strideq*2 ], ym5, 1 + vextracti128 [dstq+stride3q ], ym6, 1 + vextracti128 [dstq+strideq*4 ], ym7, 1 + vextracti128 [dstq+stride5q ], ym8, 1 + vextracti128 [dstq+stride3q*2], ym9, 1 + vextracti128 [dstq+stride7q ], ym10, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 2 + vextracti32x4 [dstq+strideq*1 ], m4, 2 + vextracti32x4 [dstq+strideq*2 ], m5, 2 + vextracti32x4 [dstq+stride3q ], m6, 2 + vextracti32x4 [dstq+strideq*4 ], m7, 2 + vextracti32x4 [dstq+stride5q ], m8, 2 + vextracti32x4 [dstq+stride3q*2], m9, 2 + vextracti32x4 [dstq+stride7q ], m10, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 3 + vextracti32x4 [dstq+strideq*1 ], m4, 3 + vextracti32x4 [dstq+strideq*2 ], m5, 3 + vextracti32x4 [dstq+stride3q ], m6, 3 + vextracti32x4 [dstq+strideq*4 ], m7, 3 + vextracti32x4 [dstq+stride5q ], m8, 3 + vextracti32x4 [dstq+stride3q*2], m9, 3 + vextracti32x4 [dstq+stride7q ], m10, 3 + lea dstq, [dstq+strideq*8+8] +%else ; %1 == 4 || %1 == 6 + punpcklwd m9, m5, m6 + punpckhwd m5, m6 + kxnorb k1, k1, k1 + punpcklwd m6, m7, m8 + punpckhwd m7, m8 + kmovb k2, k1 + punpckldq m8, m9, m6 + vpscatterdq [dstq+ym12-4]{k1}, m8 + punpckhdq m9, m6 + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpscatterdq [tmpq+ym12]{k2}, m9 + punpckldq m6, m5, m7 + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpscatterdq [tmpq+ym12]{k1}, m6 + punpckhdq m5, m7 + lea tmpq, [tmpq+strideq*2] + vpscatterdq [tmpq+ym12]{k2}, m5 +%endif +%endif +%endmacro + +INIT_ZMM avx512icl +cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride, tmp, \ + mask_bits, stride5 +%define base tmpq-filter_mask + SWAP 12, 26 ; avoids clobbering xmm10 on WIN64 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, v + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call .v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET +ALIGN function_align +.v4: ; called by both luma and chroma + FILTER 4, v + ret + +cglobal lpf_h_sb_y_16bpc, 6, 
13, 29, dst, stride, mask, l, l_stride, \ + lut, h, stride3, l_stride3, tmp, \ + mask_bits, stride5, stride7 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + vpbroadcastd ym12, strided + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride7q, [strideq+stride3q*2] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, h + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, h + jmp .end2 +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + call .h4 +.no_filter: + lea dstq, [dstq+stride3q*8] +.end: + lea dstq, [dstq+strideq*8] +.end2: + shl mask_bitsd, 8 + pslld ym21, 8 + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET +ALIGN function_align +.h4: ; called by both luma and chroma + FILTER 4, h + ret + +cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + shl l_strideq, 2 + lea stride3q, [strideq*3] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET + +cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + vpbroadcastd ym12, strided + shl l_strideq, 2 + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride3q, [strideq*3] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, h + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4 +.end: + lea tmpq, [strideq+stride3q] + shl mask_bitsd, 8 + pslld ym21, 8 + lea dstq, [dstq+tmpq*8] + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET + +%endif ; ARCH_X86_64 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm index d6b296b19ef..84696c758ae 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm @@ -1444,7 +1444,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ cmp byte [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call .v4 .end: add lq, 32 @@ -1453,6 +1453,10 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ sub wd, 8 jg .loop RET +ALIGN function_align +.v4: + FILTER 4, v + ret INIT_YMM avx2 cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ @@ -1481,7 +1485,7 @@ 
cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ cmp byte [maskq+0], 0 ; vmask[0] je .no_filter - FILTER 4, h + call .h4 jmp .end .no_filter: @@ -1493,6 +1497,10 @@ cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ sub hd, 8 jg .loop RET +ALIGN function_align +.h4: + FILTER 4, h + ret INIT_YMM avx2 cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ @@ -1515,7 +1523,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ cmp byte [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4 .end: add lq, 32 @@ -1545,7 +1553,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \ cmp byte [maskq+0], 0 ; vmask[0] je .no_filter - FILTER 4, h + call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4 jmp .end .no_filter: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm index c09dced418b..0218b624d3c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm @@ -80,25 +80,24 @@ SECTION .text punpckhwd m%1, m%3 kmovw k1, k6 lea t0, [dstq+strideq*4] - vpscatterdd [dstq+m29-2]{k1}, m%4 + vpscatterdd [dstq+m19-2]{k1}, m%4 kmovw k1, k6 lea t1, [dstq+strideq*8] - vpscatterdd [t0 +m29-2]{k1}, m%5 + vpscatterdd [t0 +m19-2]{k1}, m%5 kmovw k1, k6 lea t2, [t0 +strideq*8] - vpscatterdd [t1 +m29-2]{k1}, m%2 + vpscatterdd [t1 +m19-2]{k1}, m%2 kmovw k1, k6 - vpscatterdd [t2 +m29-2]{k1}, m%1 + vpscatterdd [t2 +m19-2]{k1}, m%1 %endmacro %macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem %if %1 == 0 - SWAP m16, m15 + SWAP m16, m22 %endif - ; input in m0-15 - punpcklbw m15, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m2, m3 + punpcklbw m22, m24, m26 + punpckhbw m24, m26 + punpcklbw m26, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 @@ -108,21 +107,21 @@ SECTION .text punpckhbw m8, m9 punpcklbw m9, m10, m11 punpckhbw m10, m11 - punpcklbw m11, m12, m13 - punpckhbw m12, m13 + punpcklbw m11, m25, m13 + punpckhbw m25, m13 %if %1 == 0 SWAP m13, m16 %else mova m13, %3 %endif - SWAP m16, m12 - punpcklbw m12, m14, m13 + SWAP m16, m25 + punpcklbw m25, m14, m13 punpckhbw m13, m14, m13 - ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 - punpcklwd m14, m15, m1 - punpckhwd m15, m1 - punpcklwd m1, m0, m2 - punpckhwd m0, m2 + ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13 + punpcklwd m14, m22, m26 + punpckhwd m22, m26 + punpcklwd m26, m24, m2 + punpckhwd m24, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 @@ -131,58 +130,58 @@ SECTION .text punpckhwd m7, m9 punpcklwd m9, m8, m10 punpckhwd m8, m10 - punpcklwd m10, m11, m12 - punpckhwd m11, m12 - SWAP m12, m16, m11 - punpcklwd m11, m12, m13 - punpckhwd m12, m13 - ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 + punpcklwd m10, m11, m25 + punpckhwd m11, m25 + SWAP m25, m16, m11 + punpcklwd m11, m25, m13 + punpckhwd m25, m13 + ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25 punpckldq m13, m14, m2 punpckhdq m14, m2 - punpckldq m2, m15, m3 - punpckhdq m15, m3 - punpckldq m3, m1, m5 - punpckhdq m1, m5 - punpckldq m5, m0, m4 - punpckhdq m0, m4 + punpckldq m2, m22, m3 + punpckhdq m22, m3 + punpckldq m3, m26, m5 + punpckhdq m26, m5 + punpckldq m5, m24, m4 + punpckhdq m24, m4 punpckldq m4, m6, m10 punpckhdq m6, m10 punpckldq m10, m9, m11 punpckhdq m9, m11 - punpckldq m11, m8, m12 - punpckhdq m8, m12 - SWAP m12, m16, m8 - punpckldq m8, m7, m12 - punpckhdq m7, m12 - ; interleaved in 
m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 - punpcklqdq m12, m13, m4 + punpckldq m11, m8, m25 + punpckhdq m8, m25 + SWAP m25, m16, m8 + punpckldq m8, m7, m25 + punpckhdq m7, m25 + ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3 + punpcklqdq m25, m13, m4 punpckhqdq m13, m4 punpcklqdq m4, m14, m6 punpckhqdq m14, m6 punpcklqdq m6, m2, m8 punpckhqdq m2, m8 - punpcklqdq m8, m15, m7 - punpckhqdq m15, m7 + punpcklqdq m8, m22, m7 + punpckhqdq m22, m7 punpcklqdq m7, m3, m10 punpckhqdq m3, m10 - punpcklqdq m10, m1, m9 - punpckhqdq m1, m9 + punpcklqdq m10, m26, m9 + punpckhqdq m26, m9 punpcklqdq m9, m5, m11 punpckhqdq m5, m11 SWAP m11, m16 %if %2 == 0 - SWAP m16, m12 + SWAP m16, m25 %else - mova %3, m12 + mova %3, m25 %endif - punpcklqdq m12, m0, m11 - punpckhqdq m0, m11 + punpcklqdq m25, m24, m11 + punpckhqdq m24, m11 %if %2 == 0 SWAP m11, m16 %endif - ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 - SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 - SWAP 3, 14, 12, 9 + ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24 + SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22 + SWAP 3, 14, 25, 9 %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] @@ -205,7 +204,7 @@ SECTION .text %endif lea t0, [dstq+mstrideq*4] %if %1 != 6 - mova m12, [t0 +strideq*0] + mova m25, [t0 +strideq*0] %endif mova m13, [t0 +strideq*1] mova m3, [t0 +strideq*2] @@ -214,13 +213,13 @@ SECTION .text mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 - mova m15, [dstq+stride3q ] + mova m22, [dstq+stride3q ] %endif %if %1 == 16 lea t0, [dstq+strideq*4] - mova m19, [t0 +strideq*0] - mova m20, [t0 +strideq*1] - mova m21, [t0 +strideq*2] + mova m29, [t0 +strideq*0] + mova m30, [t0 +strideq*1] + mova m31, [t0 +strideq*2] %endif %endif %else ; h @@ -230,15 +229,15 @@ SECTION .text vbroadcasti32x4 m0, [hshuf4] kmovw k1, k6 lea t0, [dstq+strideq*4] - vpgatherdd m3{k1}, [dstq+m29-2] + vpgatherdd m3{k1}, [dstq+m19-2] kmovw k1, k6 lea t1, [dstq+strideq*8] - vpgatherdd m4{k1}, [t0 +m29-2] + vpgatherdd m4{k1}, [t0 +m19-2] kmovw k1, k6 lea t2, [t0 +strideq*8] - vpgatherdd m5{k1}, [t1 +m29-2] + vpgatherdd m5{k1}, [t1 +m19-2] kmovw k1, k6 - vpgatherdd m6{k1}, [t2 +m29-2] + vpgatherdd m6{k1}, [t2 +m19-2] pshufb m3, m0 pshufb m4, m0 pshufb m5, m0 @@ -257,16 +256,16 @@ SECTION .text %elif %1 == 6 || %1 == 8 kmovb k1, k7 lea t0, [dstq+strideq*1] - vpgatherdq m3{k1}, [dstq+ym31-%1/2] + vpgatherdq m3{k1}, [dstq+ym21-%1/2] kmovb k1, k7 lea t1, [dstq+strideq*2] - vpgatherdq m4{k1}, [t0 +ym31-%1/2] + vpgatherdq m4{k1}, [t0 +ym21-%1/2] kmovb k1, k7 lea t2, [dstq+stride3q ] - vpgatherdq m5{k1}, [t1 +ym31-%1/2] + vpgatherdq m5{k1}, [t1 +ym21-%1/2] kmovb k1, k7 - vextracti32x8 ym0, m31, 1 - vpgatherdq m6{k1}, [t2 +ym31-%1/2] + vextracti32x8 ym0, m21, 1 + vpgatherdq m6{k1}, [t2 +ym21-%1/2] kmovb k1, k7 vpgatherdq m12{k1}, [dstq+ym0 -%1/2] kmovb k1, k7 @@ -344,7 +343,7 @@ SECTION .text punpckhqdq m13, m5, m13 %if %1 == 8 punpcklqdq m5, m7, m12 - punpckhqdq m12, m7, m12 + punpckhqdq m25, m7, m12 ; xm3: A0-15 ; xm14: B0-15 ; xm15: C0-15 @@ -352,10 +351,11 @@ SECTION .text ; xm4: E0-15 ; xm13: F0-15 ; xm5: G0-15 - ; xm12: H0-15 - SWAP 12, 3, 15 + ; xm25: H0-15 + SWAP 25, 3, 15 SWAP 13, 14, 5, 4, 6 - ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 + SWAP 15, 22 + ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22 %else SWAP 13, 3, 14 SWAP 6, 4, 15, 5 @@ -364,8 +364,8 @@ SECTION .text %else ; 16, h ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the ; remainder at the end for the second transpose - movu xm0, [dstq+strideq*0-8] - movu xm1, [dstq+strideq*1-8] + movu xm24, [dstq+strideq*0-8] + movu xm26, [dstq+strideq*1-8] movu xm2, [dstq+strideq*2-8] movu xm3, [dstq+stride3q -8] lea t0, [dstq+strideq*4] @@ -379,13 +379,13 @@ SECTION .text movu xm10, [t0 +strideq*2-8] movu xm11, [t0 +stride3q -8] lea t0, [t0 +strideq*4] - movu xm12, [t0 +strideq*0-8] + movu xm25, [t0 +strideq*0-8] movu xm13, [t0 +strideq*1-8] movu xm14, [t0 +strideq*2-8] - movu xm15, [t0 +stride3q -8] + movu xm22, [t0 +stride3q -8] lea t0, [t0 +strideq*4] - vinserti32x4 ym0, [t0 +strideq*0-8], 1 - vinserti32x4 ym1, [t0 +strideq*1-8], 1 + vinserti32x4 ym24, [t0 +strideq*0-8], 1 + vinserti32x4 ym26, [t0 +strideq*1-8], 1 vinserti32x4 ym2, [t0 +strideq*2-8], 1 vinserti32x4 ym3, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] @@ -399,13 +399,13 @@ SECTION .text vinserti32x4 ym10, [t0 +strideq*2-8], 1 vinserti32x4 ym11, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] - vinserti32x4 ym12, [t0 +strideq*0-8], 1 + vinserti32x4 ym25, [t0 +strideq*0-8], 1 vinserti32x4 ym13, [t0 +strideq*1-8], 1 vinserti32x4 ym14, [t0 +strideq*2-8], 1 - vinserti32x4 ym15, [t0 +stride3q -8], 1 + vinserti32x4 ym22, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] - vinserti32x4 m0, [t0 +strideq*0-8], 2 - vinserti32x4 m1, [t0 +strideq*1-8], 2 + vinserti32x4 m24, [t0 +strideq*0-8], 2 + vinserti32x4 m26, [t0 +strideq*1-8], 2 vinserti32x4 m2, [t0 +strideq*2-8], 2 vinserti32x4 m3, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] @@ -419,13 +419,13 @@ SECTION .text vinserti32x4 m10, [t0 +strideq*2-8], 2 vinserti32x4 m11, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] - vinserti32x4 m12, [t0 +strideq*0-8], 2 + vinserti32x4 m25, [t0 +strideq*0-8], 2 vinserti32x4 m13, [t0 +strideq*1-8], 2 vinserti32x4 m14, [t0 +strideq*2-8], 2 - vinserti32x4 m15, [t0 +stride3q -8], 2 + vinserti32x4 m22, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] - vinserti32x4 m0, [t0 +strideq*0-8], 3 - vinserti32x4 m1, [t0 +strideq*1-8], 3 + vinserti32x4 m24, [t0 +strideq*0-8], 3 + vinserti32x4 m26, [t0 +strideq*1-8], 3 vinserti32x4 m2, [t0 +strideq*2-8], 3 vinserti32x4 m3, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] @@ -439,41 +439,38 @@ SECTION .text vinserti32x4 m10, [t0 +strideq*2-8], 3 vinserti32x4 m11, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] - vinserti32x4 m12, [t0 +strideq*0-8], 3 + vinserti32x4 m25, [t0 +strideq*0-8], 3 vinserti32x4 m13, [t0 +strideq*1-8], 3 vinserti32x4 m14, [t0 +strideq*2-8], 3 - vinserti32x4 m15, [t0 +stride3q -8], 3 + vinserti32x4 m22, [t0 +stride3q -8], 3 ; TRANSPOSE_16X16B 0, 1, [rsp+0*64] - SWAP m16, m1 + SWAP m16, m26 SWAP m17, m2 SWAP m18, m3 - SWAP m19, m12 - SWAP m20, m13 - SWAP m21, m14 - mova [rsp+4*64], m15 - ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 - SWAP 12, 4, 7 + SWAP m29, m25 + SWAP m30, m13 + SWAP m31, m14 + mova [rsp+4*64], m22 + ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22 + SWAP 25, 4, 7 SWAP 13, 5, 8 SWAP 3, 6, 9 SWAP 10, 14 - SWAP 11, 15 + SWAP 11, 22 %endif %endif ; load L/E/I/H -%if is_uv - SWAP m22, m15 -%endif - vpbroadcastd m22, [pb_1] + vpbroadcastd m15, [pb_1] %ifidn %2, v movu m1, [lq] movu m0, [lq+l_strideq] %else kmovw k1, k6 - vpgatherdd m0{k1}, [lq+m30+4] + vpgatherdd m0{k1}, [lq+m20+4] kmovw k1, k6 - vpgatherdd m1{k1}, [lq+m30+0] + vpgatherdd m1{k1}, [lq+m20+0] %endif pxor m2, m2 pcmpeqb k1, m0, m2 @@ -484,7 +481,7 @@ SECTION .text pand m2, [pb_63]{bcstd} vpbroadcastb m1, [lutq+136] pminub m2, m1 - pmaxub m2, m22 ; I + pmaxub m2, m15 ; I pand m1, 
m0, [pb_240]{bcstd} psrlq m1, 4 ; H paddd m0, [pb_2]{bcstd} @@ -500,7 +497,7 @@ SECTION .text ABSSUB m9, m13, m4, m10 ; abs(p2-p0) pmaxub m9, m8 %else - ABSSUB m9, m12, m4, m10 ; abs(p3-p0) + ABSSUB m9, m25, m4, m10 ; abs(p3-p0) pmaxub m9, m8 ABSSUB m10, m13, m4, m11 ; abs(p2-p0) pmaxub m9, m10 @@ -508,17 +505,17 @@ SECTION .text ABSSUB m10, m5, m14, m11 ; abs(q2-q0) pmaxub m9, m10 %if %1 != 6 - ABSSUB m10, m5, m15, m11 ; abs(q3-q0) + ABSSUB m10, m5, m22, m11 ; abs(q3-q0) pmaxub m9, m10 %endif - vpcmpub k2{k3}, m9, m22, 2 ; le ; flat8in + vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in %if %1 == 6 ABSSUB m10, m13, m3, m1 ; abs(p2-p1) %else - ABSSUB m10, m12, m13, m11 ; abs(p3-p2) + ABSSUB m10, m25, m13, m11 ; abs(p3-p2) ABSSUB m11, m13, m3, m1 ; abs(p2-p1) pmaxub m10, m11 - ABSSUB m11, m14, m15, m1 ; abs(q3-q2) + ABSSUB m11, m14, m22, m1 ; abs(q3-q2) pmaxub m10, m11 %endif ABSSUB m11, m14, m6, m1 ; abs(q2-q1) @@ -526,16 +523,10 @@ SECTION .text %if %1 == 16 vpbroadcastd m11, [maskq+8] por m11, [maskq+4]{bcstd} - pand m11, pbmask %else - %if !is_h || %1 == 6 - pand m11, pbmask, [maskq+4]{bcstd} - %else vpbroadcastd m11, [maskq+4] - pand m11, pbmask - %endif %endif - pcmpeqd k4, m11, pbmask + vptestmd k4, m11, pbmask vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks pmaxub m8, m10 %endif @@ -554,77 +545,58 @@ SECTION .text pmaxub m1, m2 ABSSUB m2, m18, m4, m10 pmaxub m1, m2 - ABSSUB m2, m19, m5, m10 + ABSSUB m2, m29, m5, m10 pmaxub m1, m2 - ABSSUB m2, m20, m5, m10 + ABSSUB m2, m30, m5, m10 pmaxub m1, m2 - ABSSUB m2, m21, m5, m10 + ABSSUB m2, m31, m5, m10 pmaxub m1, m2 - ; - vpcmpub k4, m1, m22, 2 ; flat8out - kandq k4, k4, k2 ; flat8in & flat8out - + kandq k2, k2, k3 + vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out vpbroadcastd m2, [maskq+8] - pand m10, m2, pbmask - pcmpeqd k5, m10, pbmask + vptestmd k5, m2, pbmask vpmovm2d m7, k5 - vpmovb2m k5, m7 - kandq k4, k4, k5 ; flat16 - kandq k4, k3, k4 ; flat16 & fm + vptestmb k4{k4}, m7, m7 ; flat16 & fm por m10, m2, [maskq+4]{bcstd} - pand m2, m10, pbmask - pcmpeqd k5, m2, pbmask + vptestmd k5, m10, pbmask vpmovm2d m7, k5 - vpmovb2m k5, m7 - kandq k2, k2, k5 ; flat8in - kandq k2, k3, k2 + vptestmb k2{k2}, m7, m7 ; flat8in por m2, m10, [maskq+0]{bcstd} - pand m2, pbmask - pcmpeqd k5, m2, pbmask + vptestmd k5, m2, pbmask vpmovm2d m7, k5 - vpmovb2m k5, m7 - kandq k3, k3, k5 + vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 & !flat16 kandnq k2, k4, k2 ; flat8 & !flat16 %elif %1 != 4 vpbroadcastd m0, [maskq+4] - pand m2, m0, pbmask - pcmpeqd k4, m2, pbmask + vptestmd k4, m0, pbmask vpmovm2d m7, k4 - vpmovb2m k4, m7 - kandq k2, k2, k4 + vptestmb k2{k2}, m7, m7 kandq k2, k2, k3 ; flat8 & fm por m0, [maskq+0]{bcstd} - pand m0, pbmask - pcmpeqd k4, m0, pbmask + vptestmd k4, m0, pbmask vpmovm2d m7, k4 - vpmovb2m k4, m7 - kandq k3, k3, k4 + vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 %else %ifidn %2, v - pand m0, pbmask, [maskq+0]{bcstd} + vptestmd k4, pbmask, [maskq+0]{bcstd} %else vpbroadcastd m0, [maskq+0] - pand m0, pbmask + vptestmd k4, m0, pbmask %endif - pcmpeqd k4, m0, pbmask vpmovm2d m7, k4 - vpmovb2m k4, m7 - kandq k3, k3, k4 ; fm + vptestmb k3{k3}, m7, m7 ; fm %endif ; short filter -%if is_uv - SWAP m23, m22 - SWAP m24, m0 - SWAP m25, m12 - SWAP m26, m1 +%if %1 >= 8 + SWAP m23, m15 %endif - vpbroadcastd m23, [pb_3] - vpbroadcastd m24, [pb_4] - vpbroadcastd m25, [pb_16] - vpbroadcastd m26, [pb_64] + vpbroadcastd m15, [pb_3] + vpbroadcastd m0, [pb_4] + vpbroadcastd m12, [pb_16] + vpbroadcastd m1, [pb_64] pxor m3, pb128 
pxor m6, pb128 psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev @@ -634,16 +606,16 @@ SECTION .text paddsb m10, m11 paddsb m10, m11 paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm - paddsb m8, m10, m23 - paddsb m10, m24 + paddsb m8, m10, m15 + paddsb m10, m0 pand m8, [pb_248]{bcstd} pand m10, [pb_248]{bcstd} psrlq m8, 3 psrlq m10, 3 - pxor m8, m25 - pxor m10, m25 - psubb m8, m25 ; f2 - psubb m10, m25 ; f1 + pxor m8, m12 + pxor m10, m12 + psubb m8, m12 ; f2 + psubb m10, m12 ; f1 paddsb m4, m8 psubsb m5, m10 pxor m4, pb128 @@ -652,7 +624,7 @@ SECTION .text pxor m10, pb128 pxor m8, m8 pavgb m8, m10 ; f=(f1+1)>>1 - psubb m8, m26 + psubb m8, m1 knotq k1, k1 paddsb m3{k1}, m3, m8 psubsb m6{k1}, m6, m8 @@ -664,40 +636,40 @@ SECTION .text %ifidn %2, v lea t0, [dstq+mstrideq*8] %endif - SWAP m0, m16, m14 - SWAP m2, m17, m15 + SWAP m24, m16, m14 + SWAP m2, m17, m22 SWAP m7, m18 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A ; write -6 - vpbroadcastd m26, [pb_7_1] - vpbroadcastd m25, [pb_2] - punpcklbw m14, m0, m12 - punpckhbw m15, m0, m12 - pmaddubsw m10, m14, m26 - pmaddubsw m11, m15, m26 ; p6*7+p3 + vpbroadcastd m1, [pb_7_1] + vpbroadcastd m12, [pb_2] + punpcklbw m14, m24, m25 + punpckhbw m22, m24, m25 + pmaddubsw m10, m14, m1 + pmaddubsw m11, m22, m1 ; p6*7+p3 punpcklbw m8, m2, m7 punpckhbw m9, m2, m7 - pmaddubsw m8, m25 - pmaddubsw m9, m25 + pmaddubsw m8, m12 + pmaddubsw m9, m12 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3 %ifidn %2, h vpbroadcastd m27, [pw_2048] - vpbroadcastd m26, [pb_m1_1] + vpbroadcastd m1, [pb_m1_1] %define pw2048 m27 - %define pbm1_1 m26 + %define pbm1_1 m1 %endif punpcklbw m8, m13, m3 punpckhbw m9, m13, m3 - pmaddubsw m8, m22 - pmaddubsw m9, m22 + pmaddubsw m8, m23 + pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 punpcklbw m8, m4, m5 punpckhbw m9, m4, m5 - pmaddubsw m8, m22 - pmaddubsw m9, m22 + pmaddubsw m8, m23 + pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 pmulhrsw m8, m10, pw2048 @@ -713,17 +685,17 @@ SECTION .text ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B ; write -5 pmaddubsw m14, pbm1_1 - pmaddubsw m15, pbm1_1 + pmaddubsw m22, pbm1_1 paddw m10, m14 - paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 - punpcklbw m8, m0, m6 - punpckhbw m9, m0, m6 + paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m8, m24, m6 + punpckhbw m9, m24, m6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 SWAP m18, m8 - SWAP m22, m9 + SWAP m23, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 @@ -737,8 +709,8 @@ SECTION .text ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C ; write -4 SWAP m14, m16 - punpcklbw m8, m0, m13 - punpckhbw m9, m0, m13 + punpcklbw m8, m24, m13 + punpckhbw m9, m24, m13 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 @@ -756,21 +728,21 @@ SECTION .text %ifidn %2, v vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3 %else - vpblendmb m8{k4}, m12, m8 + vpblendmb m8{k4}, m25, m8 mova [rsp+3*64], m8 %endif ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D ; write -3 - SWAP m15, m17 - punpcklbw m8, m0, m3 - punpckhbw m9, m0, m3 + SWAP m22, m17 + punpcklbw m8, m24, m3 + punpckhbw m9, m24, m3 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 - punpcklbw m8, m7, m15 - punpckhbw m7, m15 + punpcklbw m8, m7, m22 + punpckhbw m7, m22 pmaddubsw m8, pbm1_1 pmaddubsw m7, pbm1_1 paddw m10, m8 @@ -779,69 +751,69 @@ SECTION .text pmulhrsw m8, 
m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 - vpblendmb m23{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F + vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E ; write -2 %ifidn %2, v lea t0, [dstq+strideq*4] %endif - punpcklbw m8, m0, m4 - punpckhbw m9, m0, m4 + punpcklbw m8, m24, m4 + punpckhbw m9, m24, m4 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 - punpcklbw m8, m12, m19 - punpckhbw m9, m12, m19 - SWAP m1, m19 + punpcklbw m8, m25, m29 + punpckhbw m9, m25, m29 + SWAP m26, m29 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 - SWAP m19, m8 - SWAP m24, m9 + SWAP m29, m8 + SWAP m0, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 - vpblendmb m25{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G + vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F ; write -1 %ifidn %2, h - SWAP m28, m0 + SWAP m28, m24 punpcklbw m8, m28, m5 - punpckhbw m0, m28, m5 + punpckhbw m24, m28, m5 %else - punpcklbw m8, m0, m5 - punpckhbw m0, m5 + punpcklbw m8, m24, m5 + punpckhbw m24, m5 %endif pmaddubsw m8, pbm1_1 - pmaddubsw m0, pbm1_1 + pmaddubsw m24, pbm1_1 paddw m10, m8 - paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 - punpcklbw m0, m13, m20 - punpckhbw m9, m13, m20 + paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m24, m13, m30 + punpckhbw m9, m13, m30 %ifidn %2, h - SWAP m27, m20 + SWAP m27, m30 %endif - SWAP m13, m23 - pmaddubsw m0, pbm1_1 + SWAP m13, m15 + pmaddubsw m24, pbm1_1 pmaddubsw m9, pbm1_1 - paddw m10, m0 + paddw m10, m24 paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 - SWAP m20, m0 - SWAP m23, m9 + SWAP m30, m24 + SWAP m15, m9 %ifidn %2, h - SWAP m9, m0 + SWAP m9, m24 %define pw2048 m9 %endif - pmulhrsw m0, m10, pw2048 + pmulhrsw m24, m10, pw2048 pmulhrsw m8, m11, pw2048 paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 - paddw m11, m22 - packuswb m0, m8 - punpcklbw m8, m3, m21 + paddw m11, m23 + packuswb m24, m8 + punpcklbw m8, m3, m31 pmaddubsw m8, pbm1_1 paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 SWAP m18, m8 @@ -851,34 +823,34 @@ SECTION .text SWAP m16, m9 %define pw2048 m16 %endif - punpckhbw m9, m3, m21 - SWAP m3, m25 + punpckhbw m9, m3, m31 + SWAP m3, m12 pmaddubsw m9, pbm1_1 paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 - SWAP m22, m9 + SWAP m23, m9 pmulhrsw m9, m11, pw2048 paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 %ifidn %2, h - SWAP m2, m26 + SWAP m2, m1 %define pbm1_1 m2 %endif - vpblendmb m26{k4}, m4, m0 ; don't clobber p0/m4 since we need it in H + vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G ; write +0 - SWAP m0, m21 ; q6 + SWAP m24, m31 ; q6 packuswb m8, m9 %ifidn %2, h - SWAP m21, m2 - %define pbm1_1 m21 + SWAP m31, m2 + %define pbm1_1 m31 %endif - vpblendmb m25{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I + vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H ; write +1 - punpcklbw m8, m4, m0 - punpckhbw m2, m4, m0 - SWAP m4, m26 + punpcklbw m8, m4, m24 + punpckhbw m2, m4, m24 + SWAP m4, m1 pmaddubsw m8, pbm1_1 pmaddubsw m2, pbm1_1 paddw m10, m8 @@ -892,9 +864,9 @@ SECTION .text ; 
write +2 paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 paddw m11, m7 - punpcklbw m8, m5, m0 - punpckhbw m9, m5, m0 - SWAP m5, m25 + punpcklbw m8, m5, m24 + punpckhbw m9, m5, m24 + SWAP m5, m12 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 @@ -906,10 +878,10 @@ SECTION .text ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 - paddw m10, m19 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 - paddw m11, m24 - punpcklbw m8, m6, m0 - punpckhbw m9, m6, m0 + paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + paddw m11, m0 + punpcklbw m8, m6, m24 + punpckhbw m9, m6, m24 SWAP 2, 6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 @@ -921,20 +893,20 @@ SECTION .text %ifidn %2, v vmovdqu8 [t0+mstrideq]{k4}, m8 %else - SWAP m19, m16 - %define pw2048 m19 - vpblendmb m16{k4}, m15, m8 + SWAP m29, m16 + %define pw2048 m29 + vpblendmb m16{k4}, m22, m8 %endif ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K ; write +4 - paddw m10, m20 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - paddw m11, m23 + paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + paddw m11, m15 %ifidn %2, h - SWAP m23, m8 + SWAP m15, m8 %endif - punpcklbw m8, m14, m0 - punpckhbw m9, m14, m0 + punpcklbw m8, m14, m24 + punpckhbw m9, m14, m24 SWAP 14, 7 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 @@ -946,16 +918,16 @@ SECTION .text %ifidn %2, v vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4 %else - vpblendmb m17{k4}, m1, m8 + vpblendmb m17{k4}, m26, m8 %endif ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L ; write +5 paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - paddw m11, m22 - punpcklbw m8, m15, m0 - punpckhbw m9, m15, m0 - SWAP m20, m0 + paddw m11, m23 + punpcklbw m8, m22, m24 + punpckhbw m9, m22, m24 + SWAP m30, m24 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 @@ -979,26 +951,26 @@ SECTION .text vpbroadcastd m9, [pb_3_1] vpbroadcastd m10, [pb_2_1] %if %1 == 16 - vpbroadcastd m22, [pb_1] - vpbroadcastd m24, [pb_4] + vpbroadcastd m23, [pb_1] + vpbroadcastd m0, [pb_4] %elifidn %2, h - vpbroadcastd m21, [pb_m1_1] - %define pbm1_1 m21 + vpbroadcastd m31, [pb_m1_1] + %define pbm1_1 m31 %endif - punpcklbw m0, m12, m3 - punpckhbw m1, m12, m3 - pmaddubsw m2, m0, m9 - pmaddubsw m7, m1, m9 ; 3 * p3 + p1 + punpcklbw m24, m25, m3 + punpckhbw m26, m25, m3 + pmaddubsw m2, m24, m9 + pmaddubsw m7, m26, m9 ; 3 * p3 + p1 punpcklbw m8, m13, m4 punpckhbw m11, m13, m4 pmaddubsw m8, m10 pmaddubsw m11, m10 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 - punpcklbw m8, m5, m24 - punpckhbw m11, m5, m24 - pmaddubsw m8, m22 - pmaddubsw m11, m22 + punpcklbw m8, m5, m0 + punpckhbw m11, m5, m0 + pmaddubsw m8, m23 + pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 psrlw m8, m2, 3 @@ -1015,8 +987,8 @@ SECTION .text %endif %endif - pmaddubsw m8, m0, pbm1_1 - pmaddubsw m11, m1, pbm1_1 + pmaddubsw m8, m24, pbm1_1 + pmaddubsw m11, m26, pbm1_1 paddw m2, m8 paddw m7, m11 punpcklbw m8, m13, m6 @@ -1035,14 +1007,14 @@ SECTION .text SWAP m18, m8 %endif - pmaddubsw m0, m22 - pmaddubsw m1, m22 - psubw m2, m0 - psubw m7, m1 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + psubw m2, m24 + psubw m7, m26 punpcklbw m8, m4, m14 punpckhbw m11, m4, m14 - pmaddubsw m8, m22 - pmaddubsw m11, m22 + pmaddubsw m8, m23 + pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 psrlw m8, m2, 3 @@ -1052,19 +1024,19 @@ SECTION .text %ifidn %2, v mova [t0+stride3q], m8 %else - SWAP m19, m8 + SWAP m29, m8 %endif - punpcklbw m0, m5, m15 - punpckhbw m1, m5, m15 - pmaddubsw m8, 
m0, m22 - pmaddubsw m11, m1, m22 + punpcklbw m24, m5, m22 + punpckhbw m26, m5, m22 + pmaddubsw m8, m24, m23 + pmaddubsw m11, m26, m23 paddw m2, m8 paddw m7, m11 - punpcklbw m8, m4, m12 - punpckhbw m11, m4, m12 - pmaddubsw m8, m22 - pmaddubsw m11, m22 + punpcklbw m8, m4, m25 + punpckhbw m11, m4, m25 + pmaddubsw m8, m23 + pmaddubsw m11, m23 psubw m2, m8 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 psrlw m8, m2, 3 @@ -1075,10 +1047,10 @@ SECTION .text mova [dstq+strideq*0], m11 %endif - pmaddubsw m0, pbm1_1 - pmaddubsw m1, pbm1_1 - paddw m2, m0 - paddw m7, m1 + pmaddubsw m24, pbm1_1 + pmaddubsw m26, pbm1_1 + paddw m2, m24 + paddw m7, m26 punpcklbw m8, m13, m6 punpckhbw m13, m6 pmaddubsw m8, pbm1_1 @@ -1093,18 +1065,18 @@ SECTION .text mova [dstq+strideq*1], m13 %endif - punpcklbw m0, m3, m6 - punpckhbw m1, m3, m6 - pmaddubsw m0, m22 - pmaddubsw m1, m22 - psubw m2, m0 - psubw m7, m1 - punpcklbw m0, m14, m15 - punpckhbw m1, m14, m15 - pmaddubsw m0, m22 - pmaddubsw m1, m22 - paddw m2, m0 - paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 + punpcklbw m24, m3, m6 + punpckhbw m26, m3, m6 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + psubw m2, m24 + psubw m7, m26 + punpcklbw m24, m14, m22 + punpckhbw m26, m14, m22 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + paddw m2, m24 + paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 @@ -1120,36 +1092,36 @@ SECTION .text %endif %ifidn %2, h - SWAP m0, m18 - SWAP m1, m19 + SWAP m24, m18 + SWAP m26, m29 %if %1 == 8 ; 16x8 transpose - punpcklbw m3, m12, m10 - punpckhbw m12, m10 - punpcklbw m10, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m11, m13 + punpcklbw m3, m25, m10 + punpckhbw m25, m10 + punpcklbw m10, m24, m26 + punpckhbw m24, m26 + punpcklbw m26, m11, m13 punpckhbw m11, m13 - punpcklbw m13, m2, m15 - punpckhbw m2, m15 + punpcklbw m13, m2, m22 + punpckhbw m2, m22 ; - punpcklwd m15, m3, m10 + punpcklwd m22, m3, m10 punpckhwd m3, m10 - punpcklwd m10, m12, m0 - punpckhwd m12, m0 - punpcklwd m0, m1, m13 - punpckhwd m1, m13 + punpcklwd m10, m25, m24 + punpckhwd m25, m24 + punpcklwd m24, m26, m13 + punpckhwd m26, m13 punpcklwd m13, m11, m2 punpckhwd m11, m2 ; - punpckldq m2, m15, m0 - punpckhdq m15, m0 - punpckldq m0, m3, m1 - punpckhdq m3, m1 - punpckldq m1, m10, m13 + punpckldq m2, m22, m24 + punpckhdq m22, m24 + punpckldq m24, m3, m26 + punpckhdq m3, m26 + punpckldq m26, m10, m13 punpckhdq m10, m13 - punpckldq m13, m12, m11 - punpckhdq m12, m11 + punpckldq m13, m25, m11 + punpckhdq m25, m11 ; write 8x32 vpbroadcastd ym16, strided pmulld ym16, [hmulD] @@ -1162,8 +1134,8 @@ SECTION .text kmovb k3, k6 kmovb k4, k6 vpscatterdq [dstq+ym16-4]{k1}, m2 - vpscatterdq [t1 +ym16-4]{k2}, m15 - vpscatterdq [t2 +ym16-4]{k3}, m0 + vpscatterdq [t1 +ym16-4]{k2}, m22 + vpscatterdq [t2 +ym16-4]{k3}, m24 vpscatterdq [t3 +ym16-4]{k4}, m3 lea t1, [t0+strideq*2] lea t2, [t0+strideq*4] @@ -1172,29 +1144,29 @@ SECTION .text kmovb k2, k6 kmovb k3, k6 kmovb k4, k6 - vpscatterdq [t0+ym16-4]{k1}, m1 + vpscatterdq [t0+ym16-4]{k1}, m26 vpscatterdq [t1+ym16-4]{k2}, m10 vpscatterdq [t2+ym16-4]{k3}, m13 - vpscatterdq [t3+ym16-4]{k4}, m12 + vpscatterdq [t3+ym16-4]{k4}, m25 %else ; 16x16 transpose and store SWAP 5, 10, 2 - SWAP 6, 0 - SWAP 7, 1 + SWAP 6, 24 + SWAP 7, 26 SWAP 8, 11 SWAP 9, 13 - mova m0, [rsp+0*64] - SWAP m1, m28 + mova m24, [rsp+0*64] + SWAP m26, m28 mova m2, [rsp+1*64] mova m3, [rsp+2*64] mova m4, [rsp+3*64] SWAP m11, m16 - SWAP m12, m17 + SWAP m25, m17 SWAP m13, m27 - SWAP m14, m20 + SWAP m14, m30 TRANSPOSE_16X16B 1, 
0, [rsp+4*64] - movu [dstq+strideq*0-8], xm0 - movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*0-8], xm24 + movu [dstq+strideq*1-8], xm26 movu [dstq+strideq*2-8], xm2 movu [dstq+stride3q -8], xm3 lea t0, [dstq+strideq*4] @@ -1208,13 +1180,13 @@ SECTION .text movu [t0+strideq*2-8], xm10 movu [t0+stride3q -8], xm11 lea t0, [t0+strideq*4] - movu [t0+strideq*0-8], xm12 + movu [t0+strideq*0-8], xm25 movu [t0+strideq*1-8], xm13 movu [t0+strideq*2-8], xm14 - movu [t0+stride3q -8], xm15 + movu [t0+stride3q -8], xm22 lea t0, [t0+strideq*4] - vextracti128 [t0+strideq*0-8], ym0, 1 - vextracti128 [t0+strideq*1-8], ym1, 1 + vextracti128 [t0+strideq*0-8], ym24, 1 + vextracti128 [t0+strideq*1-8], ym26, 1 vextracti128 [t0+strideq*2-8], ym2, 1 vextracti128 [t0+stride3q -8], ym3, 1 lea t0, [t0+strideq*4] @@ -1228,13 +1200,13 @@ SECTION .text vextracti128 [t0+strideq*2-8], ym10, 1 vextracti128 [t0+stride3q -8], ym11, 1 lea t0, [t0+strideq*4] - vextracti128 [t0+strideq*0-8], ym12, 1 + vextracti128 [t0+strideq*0-8], ym25, 1 vextracti128 [t0+strideq*1-8], ym13, 1 vextracti128 [t0+strideq*2-8], ym14, 1 - vextracti128 [t0+stride3q -8], ym15, 1 + vextracti128 [t0+stride3q -8], ym22, 1 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m0, 2 - vextracti32x4 [t0+strideq*1-8], m1, 2 + vextracti32x4 [t0+strideq*0-8], m24, 2 + vextracti32x4 [t0+strideq*1-8], m26, 2 vextracti32x4 [t0+strideq*2-8], m2, 2 vextracti32x4 [t0+stride3q -8], m3, 2 lea t0, [t0+strideq*4] @@ -1248,13 +1220,13 @@ SECTION .text vextracti32x4 [t0+strideq*2-8], m10, 2 vextracti32x4 [t0+stride3q -8], m11, 2 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m12, 2 + vextracti32x4 [t0+strideq*0-8], m25, 2 vextracti32x4 [t0+strideq*1-8], m13, 2 vextracti32x4 [t0+strideq*2-8], m14, 2 - vextracti32x4 [t0+stride3q -8], m15, 2 + vextracti32x4 [t0+stride3q -8], m22, 2 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m0, 3 - vextracti32x4 [t0+strideq*1-8], m1, 3 + vextracti32x4 [t0+strideq*0-8], m24, 3 + vextracti32x4 [t0+strideq*1-8], m26, 3 vextracti32x4 [t0+strideq*2-8], m2, 3 vextracti32x4 [t0+stride3q -8], m3, 3 lea t0, [t0+strideq*4] @@ -1268,19 +1240,15 @@ SECTION .text vextracti32x4 [t0+strideq*2-8], m10, 3 vextracti32x4 [t0+stride3q -8], m11, 3 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m12, 3 + vextracti32x4 [t0+strideq*0-8], m25, 3 vextracti32x4 [t0+strideq*1-8], m13, 3 vextracti32x4 [t0+strideq*2-8], m14, 3 - vextracti32x4 [t0+stride3q -8], m15, 3 + vextracti32x4 [t0+stride3q -8], m22, 3 %endif %endif %elif %1 == 6 ; flat6 filter - SWAP m15, m23 - SWAP m0, m24 - SWAP m12, m25 - SWAP m1, m26 vpbroadcastd m15, [pb_3_1] vpbroadcastd m12, [pb_2] punpcklbw m8, m13, m5 @@ -1381,17 +1349,16 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] - mova m31, [pb_4x0_4x4_4x8_4x12] - mova m30, [pb_mask] - vpbroadcastd m29, [pb_128] + mova m21, [pb_4x0_4x4_4x8_4x12] + mova m20, [pb_mask] + vpbroadcastd m19, [pb_128] vpbroadcastd m28, [pb_m1_1] vpbroadcastd m27, [pw_2048] - %define pbshuf m31 - %define pbmask m30 - %define pb128 m29 + %define pbshuf m21 + %define pbmask m20 + %define pb128 m19 %define pbm1_1 m28 %define pw2048 m27 - %define is_uv 0 .loop: cmp word [maskq+8], 0 ; vmask[2] @@ -1411,7 +1378,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call .v4 .end: add lq, 64 @@ -1420,6 +1387,11 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, 
l_stride, \ sub wd, 16 jg .loop RET +ALIGN function_align +RESET_MM_PERMUTATION +.v4: + FILTER 4, v + ret cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ lut, h, stride3, stride8 @@ -1429,11 +1401,11 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ lea stride3q, [strideq*3] lea stride8q, [strideq*8] kxnorw k6, k6, k6 - vpbroadcastd m29, strided - vpbroadcastd m30, l_strided - pmulld m31, m29, [hmulA] - pmulld m30, m30, [hmulB] - pmulld m29, m29, [hmulC] + vpbroadcastd m19, strided + vpbroadcastd m20, l_strided + pmulld m21, m19, [hmulA] + pmulld m20, [hmulB] + pmulld m19, [hmulC] %define pbshuf [pb_4x0_4x4_4x8_4x12] %define pbmask [pb_mask] %define pb128 [pb_128]{bcstd} @@ -1457,7 +1429,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, h + call .h4 .end: lea lq, [lq+l_strideq*8] @@ -1466,9 +1438,13 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ sub hd, 16 jg .loop RET +ALIGN function_align RESET_MM_PERMUTATION +.h4: + FILTER 4, h + ret -cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \ +cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \ lut, w, stride3, mstride DECLARE_REG_TMP 9 shl l_strideq, 2 @@ -1476,16 +1452,15 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \ mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] - mova m20, [pb_4x0_4x4_4x8_4x12] - mova m19, [pb_mask] - vpbroadcastd m18, [pb_128] + mova m21, [pb_4x0_4x4_4x8_4x12] + mova m20, [pb_mask] + vpbroadcastd m19, [pb_128] vpbroadcastd m17, [pb_m1_1] vpbroadcastd m16, [pw_4096] - %define pbshuf m20 - %define pbmask m19 - %define pb128 m18 + %define pbshuf m21 + %define pbmask m20 + %define pb128 m19 %define pbm1_1 m17 - %define is_uv 1 .loop: cmp word [maskq+4], 0 ; vmask[1] @@ -1498,7 +1473,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4 .end: add lq, 64 @@ -1525,17 +1500,14 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \ vpbroadcastd m19, strided vpbroadcastd m20, l_strided pmulld m21, m19, [hmulA] - pmulld m20, m20, [hmulB] - pmulld m19, m19, [hmulC] + pmulld m20, [hmulB] + pmulld m19, [hmulC] mova m18, [pb_mask] vpbroadcastd m17, [pb_128] vpbroadcastd m16, [pw_4096] %define pbshuf [pb_4x0_4x4_4x8_4x12] %define pbmask m18 %define pb128 m17 - %xdefine m31 m21 - %xdefine m30 m20 - %xdefine m29 m19 add l_strideq, l_strideq .loop: @@ -1549,7 +1521,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, h + call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4 .end: lea lq, [lq+l_strideq*8] diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h new file mode 100644 index 00000000000..de23be8866c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h @@ -0,0 +1,94 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/looprestoration.h" + +#include "common/intops.h" + +#define decl_wiener_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ +decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) + +#define decl_sgr_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext)) + +decl_wiener_filter_fns(sse2); +decl_wiener_filter_fns(ssse3); +decl_wiener_filter_fns(avx2); +decl_wiener_filter_fns(avx512icl); +decl_sgr_filter_fns(ssse3); +decl_sgr_filter_fns(avx2); +decl_sgr_filter_fns(avx512icl); + +static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; +#if BITDEPTH == 8 + c->wiener[0] = BF(dav1d_wiener_filter7, sse2); + c->wiener[1] = BF(dav1d_wiener_filter5, sse2); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); + c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3); + c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3); + } + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx2); + c->wiener[1] = BF(dav1d_wiener_filter5, avx2); + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); + } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl); +#if BITDEPTH == 8 + /* With VNNI we don't need a 5-tap version. 
*/ + c->wiener[1] = c->wiener[0]; +#else + c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl); +#endif + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl); + } +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm index 5669ce66d8f..1e571774caf 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm @@ -329,11 +329,11 @@ ALIGN function_align packuswb m2, m4 psrlw m2, 8 vpackuswb m2{k2}, m3, m5 - mova [dstq+r10], m2 - add r10, 64 - jl .hv_loop - mov t6, t5 - mov t5, t4 + movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap + add r10, 64 ; function is used for chroma as well, and in some + jl .hv_loop ; esoteric edge cases chroma dst pointers may only + mov t6, t5 ; have a 32-byte alignment despite having a width + mov t5, t4 ; larger than 32, so use an unaligned store here. mov t4, t3 mov t3, t2 mov t2, t1 @@ -379,7 +379,7 @@ ALIGN function_align packuswb m0, m2 psrlw m0, 8 vpackuswb m0{k2}, m1, m3 - mova [dstq+r10], m0 + movu [dstq+r10], m0 add r10, 64 jl .v_loop mov t6, t5 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc.h b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h new file mode 100644 index 00000000000..65c607e180c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h @@ -0,0 +1,299 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018-2021, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/mc.h" + +#define decl_fn(type, name) \ + decl_##type##_fn(BF(name, sse2)); \ + decl_##type##_fn(BF(name, ssse3)); \ + decl_##type##_fn(BF(name, avx2)); \ + decl_##type##_fn(BF(name, avx512icl)); +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = BF(dav1d_put_##name, suffix) +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = BF(dav1d_prep_##name, suffix) +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = BF(dav1d_put_##name, suffix) +#define init_mct_scaled_fn(type, name, suffix) \ + c->mct_scaled[type] = BF(dav1d_prep_##name, suffix) + +decl_fn(mc, dav1d_put_8tap_regular); +decl_fn(mc, dav1d_put_8tap_regular_smooth); +decl_fn(mc, dav1d_put_8tap_regular_sharp); +decl_fn(mc, dav1d_put_8tap_smooth); +decl_fn(mc, dav1d_put_8tap_smooth_regular); +decl_fn(mc, dav1d_put_8tap_smooth_sharp); +decl_fn(mc, dav1d_put_8tap_sharp); +decl_fn(mc, dav1d_put_8tap_sharp_regular); +decl_fn(mc, dav1d_put_8tap_sharp_smooth); +decl_fn(mc, dav1d_put_bilin); + +decl_fn(mct, dav1d_prep_8tap_regular); +decl_fn(mct, dav1d_prep_8tap_regular_smooth); +decl_fn(mct, dav1d_prep_8tap_regular_sharp); +decl_fn(mct, dav1d_prep_8tap_smooth); +decl_fn(mct, dav1d_prep_8tap_smooth_regular); +decl_fn(mct, dav1d_prep_8tap_smooth_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp_regular); +decl_fn(mct, dav1d_prep_8tap_sharp_smooth); +decl_fn(mct, dav1d_prep_bilin); + +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth); +decl_fn(mc_scaled, dav1d_put_bilin_scaled); + +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth); +decl_fn(mct_scaled, dav1d_prep_bilin_scaled); + +decl_fn(avg, dav1d_avg); +decl_fn(w_avg, dav1d_w_avg); +decl_fn(mask, dav1d_mask); +decl_fn(w_mask, dav1d_w_mask_420); +decl_fn(w_mask, dav1d_w_mask_422); +decl_fn(w_mask, dav1d_w_mask_444); +decl_fn(blend, dav1d_blend); +decl_fn(blend_dir, dav1d_blend_v); +decl_fn(blend_dir, dav1d_blend_h); + +decl_fn(warp8x8, dav1d_warp_affine_8x8); +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4)); +decl_fn(warp8x8t, dav1d_warp_affine_8x8t); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4)); + +decl_fn(emu_edge, dav1d_emu_edge); + +decl_fn(resize, dav1d_resize); + +static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) + return; + +#if BITDEPTH == 8 + init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 
8tap_regular_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); + + c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2); +#endif + + if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) + return; + + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + c->avg = BF(dav1d_avg, ssse3); + c->w_avg = BF(dav1d_w_avg, ssse3); + c->mask = 
BF(dav1d_mask, ssse3); + c->w_mask[0] = BF(dav1d_w_mask_444, ssse3); + c->w_mask[1] = BF(dav1d_w_mask_422, ssse3); + c->w_mask[2] = BF(dav1d_w_mask_420, ssse3); + c->blend = BF(dav1d_blend, ssse3); + c->blend_v = BF(dav1d_blend_v, ssse3); + c->blend_h = BF(dav1d_blend_h, ssse3); + c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3); + c->emu_edge = BF(dav1d_emu_edge, ssse3); + c->resize = BF(dav1d_resize, ssse3); + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) + return; + +#if BITDEPTH == 8 + c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4); +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) + return; + + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, 
avx2); + + c->avg = BF(dav1d_avg, avx2); + c->w_avg = BF(dav1d_w_avg, avx2); + c->mask = BF(dav1d_mask, avx2); + c->w_mask[0] = BF(dav1d_w_mask_444, avx2); + c->w_mask[1] = BF(dav1d_w_mask_422, avx2); + c->w_mask[2] = BF(dav1d_w_mask_420, avx2); + c->blend = BF(dav1d_blend, avx2); + c->blend_v = BF(dav1d_blend_v, avx2); + c->blend_h = BF(dav1d_blend_h, avx2); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2); + c->emu_edge = BF(dav1d_emu_edge, avx2); + c->resize = BF(dav1d_resize, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) + return; + + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl); + + c->avg = BF(dav1d_avg, avx512icl); + c->w_avg = BF(dav1d_w_avg, avx512icl); + c->mask = BF(dav1d_mask, avx512icl); + c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl); + c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl); + c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl); + c->blend = BF(dav1d_blend, avx512icl); + c->blend_v = BF(dav1d_blend_v, avx512icl); + c->blend_h = BF(dav1d_blend_h, avx512icl); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl); + c->resize = BF(dav1d_resize, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm index e83b18ad969..585ba53e080 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm @@ -1604,7 +1604,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] - cmp wd, 16 + sub wd, 16 je .h_w16 jg .h_w32 .h_w8: @@ -3615,32 +3615,32 @@ ALIGN function_align .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq 
[dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm0, ym1, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -3860,33 +3860,33 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: @@ -4090,32 +4090,32 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq 
[dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -4249,32 +4249,32 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm index eb3ca1c427d..7897f1decc1 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm @@ -449,9 +449,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy pshufb ym0, ym4 pmaddubsw ym0, ym5 pmulhrsw ym0, ym3 - vpmovuswb xmm0, ym0 - movq [dstq+dsq*0], xmm0 - movhps [dstq+dsq*1], xmm0 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 @@ -755,9 +755,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy pmulhw ym1, ym6 paddw ym1, ym2 pmulhrsw ym1, ym7 - vpmovuswb xmm1, ym1 - movq [dstq+dsq*0], xmm1 - movhps [dstq+dsq*1], xmm1 + vpmovuswb xm1, ym1 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop @@ -1588,13 +1588,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_w4_loop RET .h_w8: - movu xmm0, [srcq+ssq*0] - vinserti32x4 ym0, ymm0, [srcq+ssq*1], 1 + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 - vpmovuswb xmm0, ym0 - movq [dstq+dsq*0], xmm0 - movhps [dstq+dsq*1], xmm0 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 @@ -3308,17 +3308,17 @@ ALIGN function_align cmp hd, 8 jg .w4_h16 WRAP_YMM %1 0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd 
[dstq+stride3q ], xm1, 3 .w4_ret: RET .w4_h16: @@ -3332,29 +3332,29 @@ ALIGN function_align cmp hd, 4 jne .w8_h8 WRAP_YMM %1 0 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: %1_INC_PTR 2 lea dstq, [dstq+strideq*4] .w8_h8: %1 0 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq ], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3415,8 +3415,8 @@ ALIGN function_align paddw m0, [tmp2q+(%1+0)*mmsize] mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] - pmulhrsw m0, m2 - pmulhrsw m1, m2 + pmulhrsw m0, m4 + pmulhrsw m1, m4 packuswb m0, m1 %endmacro @@ -3425,13 +3425,13 @@ ALIGN function_align add tmp2q, %1*mmsize %endmacro -cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] - vpbroadcastd m2, [base+pw_1024] + vpbroadcastd m4, [base+pw_1024] add wq, r6 BIDIR_FN AVG @@ -3573,17 +3573,17 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym5, [wm_420_perm4+32], 1 vpermb ym4, ym5, ym4 vpdpbusd ym8, ym4, ym9 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: vpermb ym8, ym10, ym8 movq [maskq], xm8 @@ -3609,11 +3609,11 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpdpbusd ym8, ym4, ym9 vpermb m8, m10, m8 mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3627,18 +3627,18 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpdpbusd m1, m4, m9 vpermb m1, m10, m1 mova [maskq], xm1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps 
[dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3766,17 +3766,17 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 movhps xm10, [wm_422_mask+16] vpdpwssd ym8, ym4, ym9 vpermb ym8, ym10, ym8 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: pand xm8, xm11 mova [maskq], xm8 @@ -3801,11 +3801,11 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb ym8, ym10, ym8 pand xm8, xm11 mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3819,18 +3819,18 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb m1, m10, m1 pand ym1, ym11 mova [maskq], ym1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3936,17 +3936,17 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: RET .w4_h16: @@ -3965,11 +3965,11 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3980,18 +3980,18 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 mova [maskq], m4 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq 
[dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h index e11cd08c8a4..0bb632fb314 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h +++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h @@ -28,21 +28,21 @@ #ifndef DAV1D_SRC_X86_MSAC_H #define DAV1D_SRC_X86_MSAC_H +#include "src/cpu.h" + unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s); unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f); unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf); -/* Needed for checkasm */ -unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, - size_t n_symbols); - #if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2 #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2 @@ -55,10 +55,21 @@ unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, #if ARCH_X86_64 #define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb)) + +static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (flags & DAV1D_X86_CPU_FLAG_SSE2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + } + + if (flags & DAV1D_X86_CPU_FLAG_AVX2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; + } +} + #elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2 #endif -void dav1d_msac_init_x86(MsacContext *const s); - #endif /* DAV1D_SRC_X86_MSAC_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h index e3575ba4da7..de4124c436e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c +++ b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h @@ -32,7 +32,7 @@ decl_splat_mv_fn(dav1d_splat_mv_sse2); decl_splat_mv_fn(dav1d_splat_mv_avx2); decl_splat_mv_fn(dav1d_splat_mv_avx512icl); -COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { +static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; |