Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src')
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S | 239
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S | 24
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S | 320
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S | 39
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S | 39
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/asm.S | 6
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/cdef.h | 88
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h | 204
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/ipred.h | 80
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/itx.h | 141
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h (renamed from chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c) | 26
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h | 265
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/mc.h | 114
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h (renamed from chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c) | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/cdef.h | 3
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c | 16
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/cdf.c | 5
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/cpu.c | 19
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/cpu.h | 57
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/data.c | 6
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/decode.c | 95
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/dequant_tables.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c | 7
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/filmgrain.h | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c | 18
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/getbits.c | 82
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/getbits.h | 8
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/internal.h | 15
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ipred.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/itx.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/lf_mask.c | 53
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/lib.c | 89
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/loopfilter.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/looprestoration.h | 3
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c | 16
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/mc.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/meson.build | 31
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/msac.c | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/obu.c | 389
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/picture.c | 33
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/picture.h | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ppc/cdef.h | 61
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ppc/cdef_tmpl.c | 487
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration.h | 48
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration_tmpl.c | 321
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/qm.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c | 36
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ref.c | 6
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/ref.h | 6
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/refmvs.c | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/scan.h | 2
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/tables.h | 60
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/thread_task.c | 285
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/wedge.h | 4
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/cdef.h | 87
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm | 622
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h | 81
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/ipred.h | 146
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm | 66
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm | 92
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx.h | 356
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm | 1482
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm | 2599
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm | 231
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm | 84
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm | 298
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h | 66
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm | 172
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm | 912
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm | 16
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm | 718
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h | 94
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc.h | 299
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm | 146
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm | 166
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/msac.h | 23
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h (renamed from chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c) | 2
82 files changed, 10542 insertions(+), 2158 deletions(-)
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S
index db8ecffe6ea..aa6c272e718 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/32/itx16.S
@@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst
.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4s_x4 \r0, \r2, \r4, \r6
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
- vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
vrshr.s32 \r1, q2, #12 // t4a
- vrshr.s32 \r7, q4, #12 // t7a
+ vrshr.s32 \r7, q3, #12 // t7a
vrshr.s32 \r3, q6, #12 // t5a
vrshr.s32 \r5, q7, #12 // t6a
@@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst
vqadd.s32 q3, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
- vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5
+.irp r, q2, \r1, q3, \r3
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q2, \r1, q3, \r3
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
- vrshr.s32 q4, q4, #12 // t5
+ vrshr.s32 q7, q7, #12 // t5
vrshr.s32 q5, q6, #12 // t6
vqsub.s32 \r7, \r0, q3 // out7
vqadd.s32 \r0, \r0, q3 // out0
vqadd.s32 \r1, \r2, q5 // out1
vqsub.s32 q6, \r2, q5 // out6
- vqadd.s32 \r2, \r4, q4 // out2
- vqsub.s32 \r5, \r4, q4 // out5
+ vqadd.s32 \r2, \r4, q7 // out2
+ vqsub.s32 \r5, \r4, q7 // out5
vqadd.s32 \r3, \r6, q2 // out3
vqsub.s32 \r4, \r6, q2 // out4
vmov \r6, q6 // out6
@@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst
.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_2s_x4 \r0, \r2, \r4, \r6
+ vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
@@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst
vqadd.s32 d5, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
+.irp r, d4, \r1, d5, \r3
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, \r1, d5, \r3
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
vrshr.s32 d6, d6, #12 // t5
@@ -763,19 +795,28 @@ endfunc
vqadd.s32 q2, q8, q12 // t0
vqsub.s32 q3, q8, q12 // t4
+ vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vqadd.s32 q4, q15, q11 // t1
vqsub.s32 q5, q15, q11 // t5
vqadd.s32 q6, q10, q14 // t2
vqsub.s32 q7, q10, q14 // t6
+ vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 q10, q13, q9 // t3
vqsub.s32 q11, q13, q9 // t7
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmax.s32 \r, \r, q14
+.endr
+
vmul_vmla q8, q3, q5, d1[1], d1[0]
- vmul_vmls q12, q3, q5, d1[0], d1[1]
+ vmul_vmls q13, q3, q5, d1[0], d1[1]
vmul_vmls q14, q11, q7, d1[1], d1[0]
vrshr.s32 q3, q8, #12 // t4a
- vrshr.s32 q5, q12, #12 // t5a
+ vrshr.s32 q5, q13, #12 // t5a
vmul_vmla q8, q11, q7, d1[0], d1[1]
@@ -786,12 +827,24 @@ endfunc
vqsub.s32 q2, q2, q6 // t2
vqadd.s32 \r7, q4, q10 // out7
vqsub.s32 q4, q4, q10 // t3
- vqneg.s32 \r7, \r7 // out7
+
+ vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 \r1, q3, q7 // out1
vqsub.s32 q3, q3, q7 // t6
vqadd.s32 \r6, q5, q11 // out6
vqsub.s32 q5, q5, q11 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, q2, q4, q3, q5
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q4, q3, q5
+ vmax.s32 \r, \r, q10
+.endr
+
+ vqneg.s32 \r7, \r7 // out7
vqneg.s32 \r1, \r1 // out1
vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
@@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon
idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
+ // idct_8 leaves the row_clip_max/min constants in d9 and d8
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmax.s32 \r, \r, d8
+.endr
+
vld1.32 {q0, q1}, [r12, :128]
sub r12, r12, #32
@@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon
vqadd.s32 d25, d29, d27 // t12
vqsub.s32 d29, d29, d27 // t13
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
vrshr.s32 d21, d6, #12 // t9a
@@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon
vqsub.s32 d25, d27, d29 // t13
vqadd.s32 d27, d27, d29 // t14
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
@@ -1193,6 +1268,9 @@ endfunc
vld1.32 {q0, q1}, [r12, :128]
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
vqsub.s32 d5, d16, d23 // t8a
vqadd.s32 d16, d16, d23 // t0a
vqsub.s32 d7, d31, d24 // t9a
@@ -1210,6 +1288,13 @@ endfunc
vqadd.s32 d28, d25, d30 // t7a
vqsub.s32 d25, d25, d30 // t15a
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
@@ -1244,6 +1329,13 @@ endfunc
vqadd.s32 d20, d29, d22 // t11a
vqsub.s32 d29, d29, d22 // t15a
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
@@ -1272,24 +1364,34 @@ endfunc
vqadd.s32 \o15,d31, d26 // out15
vmov \o0, d4
.endif
- vqneg.s32 \o15, \o15 // out15
vqsub.s32 d3, d29, d18 // t15a
vqadd.s32 \o13,d29, d18 // out13
vqadd.s32 \o2, d17, d30 // out2
vqsub.s32 d26, d17, d30 // t14a
- vqneg.s32 \o13,\o13 // out13
vqadd.s32 \o1, d19, d27 // out1
vqsub.s32 d27, d19, d27 // t10
vqadd.s32 \o14,d28, d20 // out14
vqsub.s32 d20, d28, d20 // t11
- vqneg.s32 \o1, \o1 // out1
vqadd.s32 \o3, d22, d24 // out3
vqsub.s32 d22, d22, d24 // t6
vqadd.s32 \o12,d25, d23 // out12
vqsub.s32 d23, d25, d23 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmax.s32 \r, \r, d10
+.endr
+
+ vqneg.s32 \o15, \o15 // out15
+ vqneg.s32 \o13,\o13 // out13
+ vqneg.s32 \o1, \o1 // out1
vqneg.s32 \o3, \o3 // out3
vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
@@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon
vld1.32 {q0, q1}, [r12, :128]
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
vqsub.s32 d5, d16, d24 // t17
vqadd.s32 d16, d16, d24 // t16
vqsub.s32 d7, d31, d23 // t30
@@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon
vqadd.s32 d25, d19, d27 // t28
vqsub.s32 d19, d19, d27 // t29
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
@@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d29, d31, d25 // t28a
vqadd.s32 d31, d31, d25 // t31a
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
@@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d24, d24, d19 // t27a
vmov d19, d4 // out19
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
vrshr.s32 d20, d4, #12 // t20
@@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon
scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
.endif
bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
vtrn.32 d16, d17
vtrn.32 d18, d19
vtrn.32 d20, d21
@@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon
vqsub.s32 d30, d23, d22 // t62
vqadd.s32 d31, d23, d22 // t63
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
vneg.s32 d4, d4 // t34a
- vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a
+ vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
vrshr.s32 d26, d4, #12 // t34a
vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
vrshr.s32 d29, d6, #12 // t61a
- vrshr.s32 d25, d8, #12 // t33a
+ vrshr.s32 d25, d7, #12 // t33a
vrshr.s32 d30, d4, #12 // t62a
vqadd.s32 d16, d24, d27 // t32a
@@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon
vqsub.s32 d21, d30, d29 // t61
vqadd.s32 d22, d30, d29 // t62
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
- vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60
+ vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
vrshr.s32 d21, d4, #12 // t61a
vrshr.s32 d18, d6, #12 // t34a
vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
- vrshr.s32 d20, d8, #12 // t60
+ vrshr.s32 d20, d7, #12 // t60
vrshr.s32 d19, d4, #12 // t35
vst1.32 {d16, d17, d18, d19}, [r6, :128]!
@@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon
vqadd.s32 d30, d23, d22 // t48
vqsub.s32 d31, d23, d22 // t55
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
- vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a
+ vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
vrshr.s32 d25, d4, #12 // t56a
vrshr.s32 d27, d6, #12 // t39a
- vneg.s32 d8, d8 // t40a
+ vneg.s32 d7, d7 // t40a
vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
- vrshr.s32 d31, d8, #12 // t40a
+ vrshr.s32 d31, d7, #12 // t40a
vrshr.s32 d28, d4, #12 // t55a
vqadd.s32 d16, d24, d29 // t32a
@@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon
vqsub.s32 d21, d25, d28 // t55
vqadd.s32 d22, d25, d28 // t56
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
- vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47
+ vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
vrshr.s32 d18, d4, #12 // t40a
vrshr.s32 d21, d6, #12 // t55a
vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
- vrshr.s32 d19, d8, #12 // t47
+ vrshr.s32 d19, d7, #12 // t47
vrshr.s32 d20, d4, #12 // t48
vstr d16, [r6, #4*2*0] // t32a
@@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
bl inv_dct_2s_x16_neon
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
store16 r6
movdup_if d0, r12, 2896*8*(1<<16), \scale
@@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
mov r9, #-8
+ vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.macro store_addsub r0, r1, r2, r3
vld1.32 {d2}, [r6, :64]!
vld1.32 {d3}, [r6, :64]!
@@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
vld1.32 {d4}, [r6, :64]!
vqadd.s32 d7, d3, \r1
vqsub.s32 \r1, d3, \r1
+ vmin.s32 d6, d6, d1
+ vmin.s32 \r0, \r0, d1
vld1.32 {d5}, [r6, :64]!
vqadd.s32 d2, d4, \r2
sub r6, r6, #8*4
+ vmax.s32 d6, d6, d0
+ vmax.s32 \r0, \r0, d0
vqsub.s32 \r2, d4, \r2
+ vmin.s32 d7, d7, d1
+ vmin.s32 \r1, \r1, d1
vst1.32 {d6}, [r6, :64]!
vst1.32 {\r0}, [r10, :64], r9
+ vmin.s32 d2, d2, d1
+ vmin.s32 \r2, \r2, d1
+ vmax.s32 d7, d7, d0
+ vmax.s32 \r1, \r1, d0
vqadd.s32 d3, d5, \r3
vqsub.s32 \r3, d5, \r3
+ vmax.s32 d2, d2, d0
+ vmax.s32 \r2, \r2, d0
+ vmin.s32 d3, d3, d1
+ vmin.s32 \r3, \r3, d1
vst1.32 {d7}, [r6, :64]!
vst1.32 {\r1}, [r10, :64], r9
+ vmax.s32 d3, d3, d0
+ vmax.s32 \r3, \r3, d0
vst1.32 {d2}, [r6, :64]!
vst1.32 {\r2}, [r10, :64], r9
vst1.32 {d3}, [r6, :64]!
@@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
add r6, r6, #2*4*16
movrel_local r12, idct64_coeffs
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
movdup_if d0, lr, 2896*8*(1<<16), \scale
vmov_if d7, #0, \clear
add r9, r7, r8, lsl #4 // offset 16
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
index c9650e9d544..b1b2f8fe659 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx.S
@@ -483,10 +483,10 @@ endfunc
add \o1\().4s, v5.4s, v7.4s
sub \o3\().4s, \o3\().4s, v7.4s
- rshrn \o0\().4h, \o0\().4s, #12
- rshrn \o2\().4h, \o2\().4s, #12
- rshrn \o1\().4h, \o1\().4s, #12
- rshrn \o3\().4h, \o3\().4s, #12
+ sqrshrn \o0\().4h, \o0\().4s, #12
+ sqrshrn \o2\().4h, \o2\().4s, #12
+ sqrshrn \o1\().4h, \o1\().4s, #12
+ sqrshrn \o3\().4h, \o3\().4s, #12
.endm
function inv_adst_4h_x4_neon, export=1
@@ -538,21 +538,21 @@ endfunc
sub v4.4s, v4.4s, v2.4s // out3
sub v5.4s, v5.4s, v3.4s
- rshrn v18.4h, v18.4s, #12
- rshrn2 v18.8h, v19.4s, #12
+ sqrshrn v18.4h, v18.4s, #12
+ sqrshrn2 v18.8h, v19.4s, #12
- rshrn \o0\().4h, v16.4s, #12
- rshrn2 \o0\().8h, v17.4s, #12
+ sqrshrn \o0\().4h, v16.4s, #12
+ sqrshrn2 \o0\().8h, v17.4s, #12
.ifc \o2, v17
mov v17.16b, v18.16b
.endif
- rshrn \o1\().4h, v6.4s, #12
- rshrn2 \o1\().8h, v7.4s, #12
+ sqrshrn \o1\().4h, v6.4s, #12
+ sqrshrn2 \o1\().8h, v7.4s, #12
- rshrn \o3\().4h, v4.4s, #12
- rshrn2 \o3\().8h, v5.4s, #12
+ sqrshrn \o3\().4h, v4.4s, #12
+ sqrshrn2 \o3\().8h, v5.4s, #12
.endm
function inv_adst_8h_x4_neon, export=1
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
index 0a0c7768b13..eee3a9636de 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
@@ -124,6 +124,13 @@ endconst
.endif
.endm
+.macro smin_4s r0, r1, r2
+ smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2
+ smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
@@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4 \r0, \r2, \r4, \r6
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
- mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
srshr \r1\().4s, v2.4s, #12 // t4a
- srshr \r7\().4s, v4.4s, #12 // t7a
+ srshr \r7\().4s, v3.4s, #12 // t7a
srshr \r3\().4s, v6.4s, #12 // t5a
srshr \r5\().4s, v7.4s, #12 // t6a
@@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst
sqadd v3.4s, \r7\().4s, \r5\().4s // t7
sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
- mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+.irp r, v2, \r1, v3, \r3
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
- srshr v4.4s, v4.4s, #12 // t5
- srshr v5.4s, v6.4s, #12 // t6
+ srshr v7.4s, v7.4s, #12 // t5
+ srshr v6.4s, v6.4s, #12 // t6
sqsub \r7\().4s, \r0\().4s, v3.4s // out7
sqadd \r0\().4s, \r0\().4s, v3.4s // out0
- sqadd \r1\().4s, \r2\().4s, v5.4s // out1
- sqsub v6.4s, \r2\().4s, v5.4s // out6
- sqadd \r2\().4s, \r4\().4s, v4.4s // out2
- sqsub \r5\().4s, \r4\().4s, v4.4s // out5
+ sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+ sqsub v6.4s, \r2\().4s, v6.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v7.4s // out5
sqadd \r3\().4s, \r6\().4s, v2.4s // out3
sqsub \r4\().4s, \r6\().4s, v2.4s // out4
mov \r6\().16b, v6.16b // out6
@@ -660,8 +683,11 @@ endfunc
ld1 {v0.4s}, [x16]
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
sqadd v2.4s, v16.4s, v20.4s // t0
sqsub v3.4s, v16.4s, v20.4s // t4
+ mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd v4.4s, v23.4s, v19.4s // t1
sqsub v5.4s, v23.4s, v19.4s // t5
sqadd v6.4s, v18.4s, v22.4s // t2
@@ -669,6 +695,13 @@ endfunc
sqadd v18.4s, v21.4s, v17.4s // t3
sqsub v19.4s, v21.4s, v17.4s // t7
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smax_4s \r, \r, v20
+.endr
+
mul_mla v16, v3, v5, v0.s[3], v0.s[2]
mul_mls v20, v3, v5, v0.s[2], v0.s[3]
mul_mls v22, v19, v7, v0.s[3], v0.s[2]
@@ -685,12 +718,24 @@ endfunc
sqsub v2.4s, v2.4s, v6.4s // t2
sqadd \o7\().4s, v4.4s, v18.4s // out7
sqsub v4.4s, v4.4s, v18.4s // t3
- sqneg \o7\().4s, \o7\().4s // out7
+
+ mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd \o1\().4s, v3.4s, v7.4s // out1
sqsub v3.4s, v3.4s, v7.4s // t6
sqadd \o6\().4s, v5.4s, v19.4s // out6
sqsub v5.4s, v5.4s, v19.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v4, v3, v5
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+ smax_4s \r, \r, v18
+.endr
+
+ sqneg \o7\().4s, \o7\().4s // out7
sqneg \o1\().4s, \o1\().4s // out1
mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
@@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon
idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+ // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
ld1 {v0.4s, v1.4s}, [x16]
sub x16, x16, #32
mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
- mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
srshr v17.4s, v2.4s, #12 // t8a
- srshr v31.4s, v4.4s, #12 // t15a
+ srshr v31.4s, v3.4s, #12 // t15a
mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
- mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
srshr v23.4s, v6.4s, #12 // t9a
srshr v25.4s, v2.4s, #12 // t14a
mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
- srshr v21.4s, v4.4s, #12 // t10a
+ srshr v21.4s, v3.4s, #12 // t10a
srshr v27.4s, v6.4s, #12 // t13a
- mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
srshr v19.4s, v2.4s, #12 // t11a
- srshr v29.4s, v4.4s, #12 // t12a
+ srshr v29.4s, v3.4s, #12 // t12a
ld1 {v0.4s}, [x16]
@@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon
sqadd v25.4s, v29.4s, v27.4s // t12
sqsub v29.4s, v29.4s, v27.4s // t13
- mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
- srshr v21.4s, v4.4s, #12 // t9a
+ srshr v21.4s, v7.4s, #12 // t9a
srshr v27.4s, v6.4s, #12 // t14a
- mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
- srshr v29.4s, v4.4s, #12 // t13a
+ srshr v29.4s, v7.4s, #12 // t13a
neg v6.4s, v6.4s
srshr v23.4s, v6.4s, #12 // t10a
@@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon
sqsub v25.4s, v27.4s, v29.4s // t13
sqadd v27.4s, v27.4s, v29.4s // t14
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
- srshr v4.4s, v4.4s, #12 // t11
- srshr v5.4s, v6.4s, #12 // t12
- mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v7.4s, v7.4s, #12 // t11
+ srshr v6.4s, v6.4s, #12 // t12
+ mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v2.4s, v2.4s, #12 // t10a
- srshr v3.4s, v6.4s, #12 // t13a
+ srshr v3.4s, v3.4s, #12 // t13a
- sqadd v6.4s, v16.4s, v31.4s // out0
+ sqadd v1.4s, v16.4s, v31.4s // out0
sqsub v31.4s, v16.4s, v31.4s // out15
- mov v16.16b, v6.16b
+ mov v16.16b, v1.16b
sqadd v23.4s, v30.4s, v17.4s // out7
- sqsub v7.4s, v30.4s, v17.4s // out8
+ sqsub v1.4s, v30.4s, v17.4s // out8
sqadd v17.4s, v18.4s, v27.4s // out1
sqsub v30.4s, v18.4s, v27.4s // out14
sqadd v18.4s, v20.4s, v3.4s // out2
sqsub v29.4s, v20.4s, v3.4s // out13
sqadd v3.4s, v28.4s, v19.4s // out6
sqsub v25.4s, v28.4s, v19.4s // out9
- sqadd v19.4s, v22.4s, v5.4s // out3
- sqsub v28.4s, v22.4s, v5.4s // out12
- sqadd v20.4s, v24.4s, v4.4s // out4
- sqsub v27.4s, v24.4s, v4.4s // out11
+ sqadd v19.4s, v22.4s, v6.4s // out3
+ sqsub v28.4s, v22.4s, v6.4s // out12
+ sqadd v20.4s, v24.4s, v7.4s // out4
+ sqsub v27.4s, v24.4s, v7.4s // out11
sqadd v21.4s, v26.4s, v2.4s // out5
sqsub v26.4s, v26.4s, v2.4s // out10
- mov v24.16b, v7.16b
+ mov v24.16b, v1.16b
mov v22.16b, v3.16b
ret
@@ -1084,6 +1151,9 @@ endfunc
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v23.4s // t8a
sqadd v16.4s, v16.4s, v23.4s // t0a
sqsub v3.4s, v31.4s, v24.4s // t9a
@@ -1101,6 +1171,13 @@ endfunc
sqadd v28.4s, v25.4s, v30.4s // t7a
sqsub v25.4s, v25.4s, v30.4s // t15a
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
@@ -1135,6 +1212,13 @@ endfunc
sqadd v20.4s, v29.4s, v22.4s // t11a
sqsub v29.4s, v29.4s, v22.4s // t15a
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
@@ -1163,24 +1247,34 @@ endfunc
sqadd \o15\().4s, v31.4s, v26.4s // out15
mov \o0\().16b, v4.16b
.endif
- sqneg \o15\().4s, \o15\().4s // out15
sqsub v3.4s, v29.4s, v18.4s // t15a
sqadd \o13\().4s, v29.4s, v18.4s // out13
sqadd \o2\().4s, v17.4s, v30.4s // out2
sqsub v26.4s, v17.4s, v30.4s // t14a
- sqneg \o13\().4s, \o13\().4s // out13
sqadd \o1\().4s, v19.4s, v27.4s // out1
sqsub v27.4s, v19.4s, v27.4s // t10
sqadd \o14\().4s, v28.4s, v20.4s // out14
sqsub v20.4s, v28.4s, v20.4s // t11
- sqneg \o1\().4s, \o1\().4s // out1
sqadd \o3\().4s, v22.4s, v24.4s // out3
sqsub v22.4s, v22.4s, v24.4s // t6
sqadd \o12\().4s, v25.4s, v23.4s // out12
sqsub v23.4s, v25.4s, v23.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smax_4s \r, \r, v7
+.endr
+
+ sqneg \o15\().4s, \o15\().4s // out15
+ sqneg \o13\().4s, \o13\().4s // out13
+ sqneg \o1\().4s, \o1\().4s // out1
sqneg \o3\().4s, \o3\().4s // out3
mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
@@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v24.4s // t17
sqadd v16.4s, v16.4s, v24.4s // t16
sqsub v3.4s, v31.4s, v23.4s // t30
@@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon
sqadd v25.4s, v19.4s, v27.4s // t28
sqsub v19.4s, v19.4s, v27.4s // t29
- mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
- srshr v21.4s, v4.4s, #12 // t17a
+ srshr v21.4s, v7.4s, #12 // t17a
srshr v27.4s, v6.4s, #12 // t30a
neg v2.4s, v2.4s // -> t18a
- mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
srshr v19.4s, v2.4s, #12 // t18a
- srshr v24.4s, v4.4s, #12 // t29a
+ srshr v24.4s, v7.4s, #12 // t29a
mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
- mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
srshr v22.4s, v6.4s, #12 // t21a
srshr v18.4s, v2.4s, #12 // t26a
- neg v4.4s, v4.4s // -> t22a
+ neg v7.4s, v7.4s // -> t22a
mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
- srshr v17.4s, v4.4s, #12 // t22a
+ srshr v17.4s, v7.4s, #12 // t22a
srshr v20.4s, v6.4s, #12 // t25a
sqsub v2.4s, v27.4s, v24.4s // t29
@@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon
sqsub v29.4s, v31.4s, v25.4s // t28a
sqadd v31.4s, v31.4s, v25.4s // t31a
- mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
- srshr v18.4s, v4.4s, #12 // t18a
+ srshr v18.4s, v7.4s, #12 // t18a
srshr v25.4s, v6.4s, #12 // t29a
- mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
srshr v29.4s, v2.4s, #12 // t19
- srshr v24.4s, v4.4s, #12 // t28
+ srshr v24.4s, v7.4s, #12 // t28
neg v6.4s, v6.4s // -> t20
mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
- mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
srshr v26.4s, v6.4s, #12 // t20
srshr v19.4s, v2.4s, #12 // t27
- neg v4.4s, v4.4s // -> t21a
+ neg v7.4s, v7.4s // -> t21a
mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
- srshr v20.4s, v4.4s, #12 // t21a
+ srshr v20.4s, v7.4s, #12 // t21a
srshr v28.4s, v6.4s, #12 // t26a
sqsub v2.4s, v16.4s, v30.4s // t23
@@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon
sqsub v21.4s, v27.4s, v22.4s // t25a
sqsub v27.4s, v18.4s, v20.4s // t21
sqadd v18.4s, v18.4s, v20.4s // t18 = out18
- sqadd v4.4s, v29.4s, v26.4s // t19a = out19
+ sqadd v7.4s, v29.4s, v26.4s // t19a = out19
sqsub v26.4s, v29.4s, v26.4s // t20a
sqadd v29.4s, v25.4s, v28.4s // t29 = out29
sqsub v25.4s, v25.4s, v28.4s // t26
sqadd v28.4s, v24.4s, v19.4s // t28a = out28
sqsub v24.4s, v24.4s, v19.4s // t27a
- mov v19.16b, v4.16b // out19
+ mov v19.16b, v7.16b // out19
- mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
- srshr v20.4s, v4.4s, #12 // t20
+ srshr v20.4s, v7.4s, #12 // t20
srshr v22.4s, v6.4s, #12 // t27
- mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
mov v27.16b, v22.16b // t27
- srshr v26.4s, v4.4s, #12 // t26a
+ srshr v26.4s, v7.4s, #12 // t26a
mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
- mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
+ mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
srshr v21.4s, v6.4s, #12 // t21a
srshr v22.4s, v24.4s, #12 // t22
- srshr v25.4s, v4.4s, #12 // t25
+ srshr v25.4s, v7.4s, #12 // t25
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
- srshr v23.4s, v4.4s, #12 // t23a
+ srshr v23.4s, v7.4s, #12 // t23a
srshr v24.4s, v6.4s, #12 // t24a
ret
@@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
@@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon
sqsub v30.4s, v23.4s, v22.4s // t62
sqadd v31.4s, v23.4s, v22.4s // t63
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
- mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
neg v2.4s, v2.4s // t34a
mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
srshr v26.4s, v2.4s, #12 // t34a
mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
- srshr v29.4s, v4.4s, #12 // t61a
+ srshr v29.4s, v7.4s, #12 // t61a
srshr v25.4s, v6.4s, #12 // t33a
srshr v30.4s, v2.4s, #12 // t62a
@@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon
sqsub v21.4s, v30.4s, v29.4s // t61
sqadd v22.4s, v30.4s, v29.4s // t62
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
- mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
srshr v21.4s, v2.4s, #12 // t61a
- srshr v18.4s, v4.4s, #12 // t34a
+ srshr v18.4s, v7.4s, #12 // t34a
mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
srshr v20.4s, v6.4s, #12 // t60
srshr v19.4s, v2.4s, #12 // t35
@@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon
sqadd v30.4s, v23.4s, v22.4s // t48
sqsub v31.4s, v23.4s, v22.4s // t55
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
- mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
srshr v25.4s, v2.4s, #12 // t56a
- srshr v27.4s, v4.4s, #12 // t39a
+ srshr v27.4s, v7.4s, #12 // t39a
neg v6.4s, v6.4s // t40a
mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
srshr v31.4s, v6.4s, #12 // t40a
@@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon
sqsub v21.4s, v25.4s, v28.4s // t55
sqadd v22.4s, v25.4s, v28.4s // t56
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
- mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
srshr v18.4s, v2.4s, #12 // t40a
- srshr v21.4s, v4.4s, #12 // t55a
+ srshr v21.4s, v7.4s, #12 // t55a
mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
srshr v19.4s, v6.4s, #12 // t47
srshr v20.4s, v2.4s, #12 // t48
@@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
bl inv_dct_4s_x16_neon
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
store16 x6
movz16dup_if v0.2s, w16, #2896*8, \scale
@@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
mov x9, #-16
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
.macro store_addsub r0, r1, r2, r3
ld1 {v2.4s}, [x6], #16
ld1 {v3.4s}, [x6], #16
@@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
ld1 {v4.4s}, [x6], #16
sqadd v7.4s, v3.4s, \r1
sqsub \r1, v3.4s, \r1
+ smin v6.4s, v6.4s, v1.4s
+ smin \r0, \r0, v1.4s
ld1 {v5.4s}, [x6], #16
sqadd v2.4s, v4.4s, \r2
sub x6, x6, #16*4
+ smax v6.4s, v6.4s, v0.4s
+ smax \r0, \r0, v0.4s
sqsub \r2, v4.4s, \r2
+ smin v7.4s, v7.4s, v1.4s
+ smin \r1, \r1, v1.4s
st1 {v6.4s}, [x6], #16
st1 {\r0}, [x10], x9
+ smin v2.4s, v2.4s, v1.4s
+ smin \r2, \r2, v1.4s
+ smax v7.4s, v7.4s, v0.4s
+ smax \r1, \r1, v0.4s
sqadd v3.4s, v5.4s, \r3
sqsub \r3, v5.4s, \r3
+ smax v2.4s, v2.4s, v0.4s
+ smax \r2, \r2, v0.4s
+ smin v3.4s, v3.4s, v1.4s
+ smin \r3, \r3, v1.4s
st1 {v7.4s}, [x6], #16
st1 {\r1}, [x10], x9
+ smax v3.4s, v3.4s, v0.4s
+ smax \r3, \r3, v0.4s
st1 {v2.4s}, [x6], #16
st1 {\r2}, [x10], x9
st1 {v3.4s}, [x6], #16
@@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
add x6, x6, #4*4*16
movrel x17, idct64_coeffs
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
movz16dup_if v0.2s, w16, #2896*8, \scale
movi_if v7.4s, #0, \clear
add x9, x7, x8, lsl #4 // offset 16
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S
index 2b9b5c408ec..63d5de10ada 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter.S
@@ -28,6 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
.macro loop_filter wd
function lpf_16_wd\wd\()_neon
uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
@@ -77,8 +82,10 @@ function lpf_16_wd\wd\()_neon
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
- b.eq 9f // if (!fm || wd < 4) return;
-
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
.if \wd >= 6
movi v10.16b, #1
uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
@@ -474,20 +481,20 @@ function lpf_16_wd\wd\()_neon
bif v11.16b, v29.16b, v15.16b // out q5
.endif
+ mov x14, #0
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
- ret x13
+ mov x14, #(1 << 6)
+ ret
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
- ret x14
+ mov x14, #(1 << 4)
+ ret
.endif
-9:
- // Return directly without writing back any pixels
- ret x15
endfunc
.endm
@@ -497,22 +504,34 @@ loop_filter 6
loop_filter 4
.macro lpf_16_wd16
- adr x13, 7f
- adr x14, 8f
bl lpf_16_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_16_wd8
- adr x14, 8f
bl lpf_16_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_16_wd6
bl lpf_16_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
.macro lpf_16_wd4
bl lpf_16_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
function lpf_v_4_16_neon
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S
index aab0230c44b..d181a3e6239 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/loopfilter16.S
@@ -28,6 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
@@ -77,8 +82,10 @@ function lpf_8_wd\wd\()_neon
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
- b.eq 9f // if (!fm || wd < 4) return;
-
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
.if \wd >= 6
movi v10.8h, #1
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
@@ -360,20 +367,20 @@ function lpf_8_wd\wd\()_neon
bif v11.16b, v29.16b, v15.16b // out q5
.endif
+ mov x14, #0
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
- ret x13
+ mov x14, #(1 << 6)
+ ret
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
- ret x14
+ mov x14, #(1 << 4)
+ ret
.endif
-9:
- // Return directly without writing back any pixels
- ret x15
endfunc
.endm
@@ -383,22 +390,34 @@ loop_filter 6
loop_filter 4
.macro lpf_8_wd16
- adr x13, 7f
- adr x14, 8f
bl lpf_8_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_8_wd8
- adr x14, 8f
bl lpf_8_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_8_wd6
bl lpf_8_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
.macro lpf_8_wd4
bl lpf_8_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
function lpf_v_4_8_neon
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
index d1083c6b561..dc50415f1f1 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/asm.S
@@ -135,6 +135,12 @@
#endif
#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+#elif defined(__APPLE__) && defined(__arm64e__)
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+
#else /* __ARM_FEATURE_PAC_DEFAULT */
#define GNU_PROPERTY_AARCH64_PAC 0
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h
new file mode 100644
index 00000000000..2e8c8ab6fb8
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/cdef.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
+
+void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+
+// Passing edges to this function, to allow it to switch to a more
+// optimized version for fully edged cases. Using size_t for edges,
+// to avoid ABI differences for passing more than one argument on the stack.
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+
+#define DEFINE_FILTER(w, h, tmp_stride) \
+static void \
+cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, const int sec_strength, \
+ const int dir, const int damping, \
+ const enum CdefEdgeFlags edges \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
+ BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \
+ left, top, bottom, h, edges); \
+ BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
+ sec_strength, dir, damping, h, edges \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+DEFINE_FILTER(8, 8, 16)
+DEFINE_FILTER(4, 8, 8)
+DEFINE_FILTER(4, 4, 8)
+
+static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->dir = BF(dav1d_cdef_find_dir, neon);
+ c->fb[0] = cdef_filter_8x8_neon;
+ c->fb[1] = cdef_filter_4x8_neon;
+ c->fb[2] = cdef_filter_4x4_neon;
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h
new file mode 100644
index 00000000000..48776ac8524
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/filmgrain.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+#include "asm-offsets.h"
+
+CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT);
+
+CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET);
+CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE);
+
+void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX);
+
+#define GEN_GRAIN_UV(suff) \
+void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, \
+ const intptr_t uv \
+ HIGHBD_DECL_SUFFIX)
+
+GEN_GRAIN_UV(420);
+GEN_GRAIN_UV(422);
+GEN_GRAIN_UV(444);
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// same layout of parameters on the stack across platforms.
+void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
+ const pixel *const src,
+ const ptrdiff_t stride,
+ const uint8_t scaling[SCALING_SIZE],
+ const int scaling_shift,
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[][2],
+ const int h, const ptrdiff_t clip,
+ const ptrdiff_t type
+ HIGHBD_DECL_SUFFIX);
+
+static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride,
+ const Dav1dFilmGrainData *const data, const size_t pw,
+ const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in BLOCK_SIZE^2 blocks
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ int type = 0;
+ if (data->overlap_flag && row_num)
+ type |= 1; /* overlap y */
+ if (data->overlap_flag && bx)
+ type |= 2; /* overlap x */
+
+ BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride,
+ scaling, data->scaling_shift,
+ grain_lut, offsets, bh,
+ data->clip_to_restricted_range, type
+ HIGHBD_TAIL_SUFFIX);
+ }
+}
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// parameters on the stack with the same layout across platforms.
+#define FGUV(nm, sx, sy) \
+void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
+ const pixel *const src, \
+ const ptrdiff_t stride, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const Dav1dFilmGrainData *const data, \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, \
+ const int offsets[][2], \
+ const ptrdiff_t h, const ptrdiff_t uv, \
+ const ptrdiff_t is_id, \
+ const ptrdiff_t type \
+ HIGHBD_DECL_SUFFIX); \
+static void \
+fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
+ const size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], const int bh, \
+ const int row_num, const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, const int uv, const int is_id \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ const int rows = 1 + (data->overlap_flag && row_num > 0); \
+ \
+ /* seed[0] contains the current row, seed[1] contains the previous */ \
+ unsigned seed[2]; \
+ for (int i = 0; i < rows; i++) { \
+ seed[i] = data->seed; \
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \
+ } \
+ \
+ int offsets[2 /* col offset */][2 /* row offset */]; \
+ \
+ /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
+ if (data->overlap_flag && bx) { \
+ /* shift previous offsets left */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[1][i] = offsets[0][i]; \
+ } \
+ \
+ /* update current offsets */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[0][i] = get_random_number(8, &seed[i]); \
+ \
+ int type = 0; \
+ if (data->overlap_flag && row_num) \
+ type |= 1; /* overlap y */ \
+ if (data->overlap_flag && bx) \
+ type |= 2; /* overlap x */ \
+ if (data->chroma_scaling_from_luma) \
+ type |= 4; \
+ \
+ BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \
+ scaling, data, grain_lut, \
+ luma_row + (bx << sx), luma_stride, \
+ offsets, bh, uv, is_id, type \
+ HIGHBD_TAIL_SUFFIX); \
+ } \
+}
+
+FGUV(420, 1, 1);
+FGUV(422, 1, 0);
+FGUV(444, 0, 0);
+
+static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon);
+
+ c->fgy_32x32xn = fgy_32x32xn_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
+}
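
For illustration (not part of the patch): the per-row seed derivation that fgy_32x32xn_neon and the fguv_32x32xn_*_neon wrappers above perform before calling the NEON block kernels, as a minimal standalone sketch. The frame-level seed value is hypothetical; with overlap enabled, seed[1] is simply the same derivation evaluated for row_num - 1.

#include <stdio.h>

/* Derive the grain seed for one row from the frame-level seed, mirroring the
 * seed[] setup in the wrappers above. */
static unsigned row_seed(const unsigned frame_seed, const int row_num) {
    unsigned seed = frame_seed;
    seed ^= ((row_num * 37 + 178) & 0xFF) << 8;
    seed ^= ((row_num * 173 + 105) & 0xFF);
    return seed;
}

int main(void) {
    const unsigned frame_seed = 0x1234; /* hypothetical Dav1dFilmGrainData::seed */
    for (int row = 0; row < 3; row++)
        printf("row %d -> seed 0x%04x\n", row, row_seed(frame_seed, row));
    return 0;
}
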
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h
new file mode 100644
index 00000000000..aef4daebbf1
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/ipred.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
+
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
+
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
+
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
+
+static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
+ c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
+ c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
+ c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon);
+ c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon);
+ c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon);
+ c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon);
+ c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
+ c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+ c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+ c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon);
+
+ c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon);
+ c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon);
+ c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon);
+ c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon);
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
+
+ c->pal_pred = BF(dav1d_pal_pred, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/itx.h b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h
new file mode 100644
index 00000000000..2ecd086b3be
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/itx.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+decl_itx17_fns( 4, 4, neon);
+decl_itx16_fns( 4, 8, neon);
+decl_itx16_fns( 4, 16, neon);
+decl_itx16_fns( 8, 4, neon);
+decl_itx16_fns( 8, 8, neon);
+decl_itx16_fns( 8, 16, neon);
+decl_itx2_fns ( 8, 32, neon);
+decl_itx16_fns(16, 4, neon);
+decl_itx16_fns(16, 8, neon);
+decl_itx12_fns(16, 16, neon);
+decl_itx2_fns (16, 32, neon);
+decl_itx2_fns (32, 8, neon);
+decl_itx2_fns (32, 16, neon);
+decl_itx2_fns (32, 32, neon);
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ if (BITDEPTH == 16 && bpc != 10) return;
+
+ assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn(R, 4, 8, neon);
+ assign_itx16_fn(R, 4, 16, neon);
+ assign_itx16_fn(R, 8, 4, neon);
+ assign_itx16_fn( , 8, 8, neon);
+ assign_itx16_fn(R, 8, 16, neon);
+ assign_itx2_fn (R, 8, 32, neon);
+ assign_itx16_fn(R, 16, 4, neon);
+ assign_itx16_fn(R, 16, 8, neon);
+ assign_itx12_fn( , 16, 16, neon);
+ assign_itx2_fn (R, 16, 32, neon);
+ assign_itx1_fn (R, 16, 64, neon);
+ assign_itx2_fn (R, 32, 8, neon);
+ assign_itx2_fn (R, 32, 16, neon);
+ assign_itx2_fn ( , 32, 32, neon);
+ assign_itx1_fn (R, 32, 64, neon);
+ assign_itx1_fn (R, 64, 16, neon);
+ assign_itx1_fn (R, 64, 32, neon);
+ assign_itx1_fn ( , 64, 64, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h
index a634da27c4e..9ac08d94d29 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/loopfilter.h
@@ -1,5 +1,6 @@
/*
- * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -25,19 +26,20 @@
*/
#include "src/cpu.h"
-#include "src/msac.h"
-#include "src/x86/msac.h"
+#include "src/loopfilter.h"
-#if ARCH_X86_64
-void dav1d_msac_init_x86(MsacContext *const s) {
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+
+static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
- if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
- s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
- }
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
- if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
- s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
- }
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
}
-#endif
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h
new file mode 100644
index 00000000000..7993dbff683
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/looprestoration.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if ARCH_AARCH64
+void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+#else
+
+// The 8bpc version calculates things slightly differently than the reference
+// C version. That version calculates roughly this:
+// int16_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += src[idx] * fh[i];
+// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
+// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
+// sum += 1 << (bitdepth + 6 - round_bits_h);
+// Compared to the reference C version, this is the output of the first pass
+// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
+// with round_offset precompensated.
+// The 16bpc version calculates things pretty much the same way as the
+// reference C version, but with the end result subtracted by
+// 1 << (bitdepth + 6 - round_bits_h).
+void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[8], intptr_t w,
+ int h, enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+// This calculates things slightly differently than the reference C version.
+// This version calculates roughly this:
+// int32_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += mid[idx] * fv[i];
+// sum = (sum + rounding_off_v) >> round_bits_v;
+// This function assumes that the width is a multiple of 8.
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[8], enum LrEdgeFlags edges,
+ ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+ ALIGN_STK_16(int16_t, mid, 68 * 384,);
+ int mid_stride = (w + 7) & ~7;
+
+ // Horizontal filter
+ BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
+ filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
+ filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+ lpf + 6 * PXSTRIDE(stride),
+ stride, filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+
+ // Vertical filter
+ BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
+ w, h, filter[1], edges,
+ mid_stride * sizeof(*mid)
+ HIGHBD_TAIL_SUFFIX);
+}
+#endif
+
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 3x3 box (radius=1) */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+ BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 5x5 box (radius=2) */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+ BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int w, const int h,
+ const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int16_t *t2,
+ const int w, const int h,
+ const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+}
+
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+}
+
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
+ BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
+ tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
+}
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if ARCH_AARCH64
+ c->wiener[0] = BF(dav1d_wiener_filter7, neon);
+ c->wiener[1] = BF(dav1d_wiener_filter5, neon);
+#else
+ c->wiener[0] = c->wiener[1] = wiener_filter_neon;
+#endif
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = sgr_filter_5x5_neon;
+ c->sgr[1] = sgr_filter_3x3_neon;
+ c->sgr[2] = sgr_filter_mix_neon;
+ }
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/mc.h b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h
new file mode 100644
index 00000000000..06cd533a9b4
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/mc.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_bilin, neon));
+
+decl_avg_fn(BF(dav1d_avg, neon));
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
+static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+
+ c->avg = BF(dav1d_avg, neon);
+ c->w_avg = BF(dav1d_w_avg, neon);
+ c->mask = BF(dav1d_mask, neon);
+ c->blend = BF(dav1d_blend, neon);
+ c->blend_h = BF(dav1d_blend_h, neon);
+ c->blend_v = BF(dav1d_blend_v, neon);
+ c->w_mask[0] = BF(dav1d_w_mask_444, neon);
+ c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+ c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+ c->emu_edge = BF(dav1d_emu_edge, neon);
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h
index acde030a368..4c96fc50952 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/refmvs_init.c
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/refmvs.h
@@ -30,7 +30,7 @@
decl_splat_mv_fn(dav1d_splat_mv_neon);
-COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
+static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
diff --git a/chromium/third_party/dav1d/libdav1d/src/cdef.h b/chromium/third_party/dav1d/libdav1d/src/cdef.h
index 2a933d54ef9..07c84d9ff50 100644
--- a/chromium/third_party/dav1d/libdav1d/src/cdef.h
+++ b/chromium/third_party/dav1d/libdav1d/src/cdef.h
@@ -67,8 +67,5 @@ typedef struct Dav1dCdefDSPContext {
} Dav1dCdefDSPContext;
bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
-bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c);
-bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c);
-bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c);
#endif /* DAV1D_SRC_CDEF_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c
index 1c95dbf9141..59439457a18 100644
--- a/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/cdef_tmpl.c
@@ -303,6 +303,16 @@ static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
return best_dir;
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/cdef.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/cdef.h"
+#elif ARCH_X86
+#include "src/x86/cdef.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
c->dir = cdef_find_dir_c;
c->fb[0] = cdef_filter_block_8x8_c;
@@ -311,11 +321,11 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_cdef_dsp_init_arm)(c);
+ cdef_dsp_init_arm(c);
#elif ARCH_PPC64LE
- bitfn(dav1d_cdef_dsp_init_ppc)(c);
+ cdef_dsp_init_ppc(c);
#elif ARCH_X86
- bitfn(dav1d_cdef_dsp_init_x86)(c);
+ cdef_dsp_init_x86(c);
#endif
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/cdf.c b/chromium/third_party/dav1d/libdav1d/src/cdf.c
index 8ac87fe0354..e0f2132e007 100644
--- a/chromium/third_party/dav1d/libdav1d/src/cdf.c
+++ b/chromium/third_party/dav1d/libdav1d/src/cdf.c
@@ -4118,7 +4118,6 @@ void dav1d_cdf_thread_ref(CdfThreadContext *const dst,
}
void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
- if (cdf->ref)
- dav1d_ref_dec(&cdf->ref);
- memset(cdf, 0, sizeof(*cdf));
+ memset(&cdf->data, 0, sizeof(*cdf) - offsetof(CdfThreadContext, data));
+ dav1d_ref_dec(&cdf->ref);
}
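
For illustration (not part of the patch): the offsetof()-based partial reset now used by dav1d_cdf_thread_unref() above zeroes every member from a chosen field to the end of the struct while leaving the leading reference pointer intact for dav1d_ref_dec() to release. The struct and values below are hypothetical.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    void *ref;      /* released separately, not cleared by the memset */
    int   data;
    int   progress;
} Ctx;

int main(void) {
    Ctx c = { (void *) 0x1, 42, 7 };
    /* Clear everything from 'data' to the end of the struct in one call. */
    memset(&c.data, 0, sizeof(c) - offsetof(Ctx, data));
    printf("ref=%p data=%d progress=%d\n", c.ref, c.data, c.progress);
    return 0;
}
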
diff --git a/chromium/third_party/dav1d/libdav1d/src/cpu.c b/chromium/third_party/dav1d/libdav1d/src/cpu.c
index 2e5e8d9036e..d24148c352e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/cpu.c
+++ b/chromium/third_party/dav1d/libdav1d/src/cpu.c
@@ -48,28 +48,24 @@
#define cpu_set_t cpuset_t
#endif
-static unsigned flags = 0;
-static unsigned flags_mask = -1;
+unsigned dav1d_cpu_flags = 0U;
+unsigned dav1d_cpu_flags_mask = ~0U;
COLD void dav1d_init_cpu(void) {
#if HAVE_ASM && !__has_feature(memory_sanitizer)
// memory sanitizer is inherently incompatible with asm
#if ARCH_AARCH64 || ARCH_ARM
- flags = dav1d_get_cpu_flags_arm();
+ dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
#elif ARCH_PPC64LE
- flags = dav1d_get_cpu_flags_ppc();
+ dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
#elif ARCH_X86
- flags = dav1d_get_cpu_flags_x86();
+ dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
#endif
#endif
}
-COLD unsigned dav1d_get_cpu_flags(void) {
- return flags & flags_mask;
-}
-
COLD void dav1d_set_cpu_flags_mask(const unsigned mask) {
- flags_mask = mask;
+ dav1d_cpu_flags_mask = mask;
}
COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
@@ -99,6 +95,7 @@ COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
#elif defined(_SC_NPROCESSORS_ONLN)
return (int)sysconf(_SC_NPROCESSORS_ONLN);
#endif
- dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n");
+ if (c)
+ dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n");
return 1;
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/cpu.h b/chromium/third_party/dav1d/libdav1d/src/cpu.h
index b5c27f7a216..8f70fefe54f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/cpu.h
+++ b/chromium/third_party/dav1d/libdav1d/src/cpu.h
@@ -1,6 +1,6 @@
/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -43,9 +43,60 @@
#include "src/x86/cpu.h"
#endif
+EXTERN unsigned dav1d_cpu_flags;
+EXTERN unsigned dav1d_cpu_flags_mask;
+
void dav1d_init_cpu(void);
-unsigned dav1d_get_cpu_flags(void);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
int dav1d_num_logical_processors(Dav1dContext *c);
+static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
+ unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
+
+#if TRIM_DSP_FUNCTIONS
+/* Since this function is inlined, unconditionally setting a flag here will
+ * enable dead code elimination in the calling function. */
+#if ARCH_AARCH64 || ARCH_ARM
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+ flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#endif
+#elif ARCH_PPC64LE
+#if defined(__VSX__)
+ flags |= DAV1D_PPC_CPU_FLAG_VSX;
+#endif
+#elif ARCH_X86
+#if defined(__AVX512F__) && defined(__AVX512CD__) && \
+ defined(__AVX512BW__) && defined(__AVX512DQ__) && \
+ defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
+ defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
+ defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
+ defined(__AVX512BITALG__) && defined(__GFNI__) && \
+ defined(__VAES__) && defined(__VPCLMULQDQ__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
+ DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__AVX2__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSE4_1__) || defined(__AVX__)
+ flags |= DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSSE3__)
+ flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif ARCH_X86_64 || defined(__SSE2__) || \
+ (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+ flags |= DAV1D_X86_CPU_FLAG_SSE2;
+#endif
+#endif
+#endif
+
+ return flags;
+}
+
#endif /* DAV1D_SRC_CPU_H */
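
For illustration (not part of the patch): the inlining trick described in the TRIM_DSP_FUNCTIONS comment above. Because the flag getter is a static inline that can OR in flags guaranteed at compile time, the early-return guards in the *_dsp_init_*() helpers fold to compile-time constants and the unreferenced fallbacks can be discarded. All names below are hypothetical.

#include <stdio.h>

#define FLAG_NEON 1u

static unsigned runtime_flags;          /* filled in by CPU detection at startup */

static inline unsigned get_cpu_flags(void) {
    unsigned flags = runtime_flags;
#if defined(__ARM_NEON) || defined(__aarch64__)
    flags |= FLAG_NEON;                 /* known unconditionally on these targets */
#endif
    return flags;
}

static void fn_c(void)    { puts("C fallback"); }
static void fn_neon(void) { puts("NEON"); }

int main(void) {
    void (*fn)(void) = fn_c;
    if (get_cpu_flags() & FLAG_NEON)    /* folds to a constant when NEON is guaranteed */
        fn = fn_neon;
    fn();
    return 0;
}
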
diff --git a/chromium/third_party/dav1d/libdav1d/src/data.c b/chromium/third_party/dav1d/libdav1d/src/data.c
index fa6165ec721..8a1386ad95a 100644
--- a/chromium/third_party/dav1d/libdav1d/src/data.c
+++ b/chromium/third_party/dav1d/libdav1d/src/data.c
@@ -47,8 +47,9 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
buf->ref = dav1d_ref_create(sz);
if (!buf->ref) return NULL;
buf->data = buf->ref->const_data;
- buf->sz = buf->m.size = sz;
+ buf->sz = sz;
dav1d_data_props_set_defaults(&buf->m);
+ buf->m.size = sz;
return buf->ref->data;
}
@@ -66,8 +67,9 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
if (!buf->ref) return DAV1D_ERR(ENOMEM);
buf->data = ptr;
- buf->sz = buf->m.size = sz;
+ buf->sz = sz;
dav1d_data_props_set_defaults(&buf->m);
+ buf->m.size = sz;
return 0;
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/decode.c b/chromium/third_party/dav1d/libdav1d/src/decode.c
index 13d57060710..2c816338a9e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/decode.c
+++ b/chromium/third_party/dav1d/libdav1d/src/decode.c
@@ -749,9 +749,9 @@ static inline void splat_intraref(const Dav1dContext *const c,
c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
}
-static inline void mc_lowest_px(int *const dst, const int by4, const int bh4,
- const int mvy, const int ss_ver,
- const struct ScalableMotionParams *const smp)
+static void mc_lowest_px(int *const dst, const int by4, const int bh4,
+ const int mvy, const int ss_ver,
+ const struct ScalableMotionParams *const smp)
{
const int v_mul = 4 >> ss_ver;
if (!smp->scale) {
@@ -766,14 +766,11 @@ static inline void mc_lowest_px(int *const dst, const int by4, const int bh4,
}
}
-static inline void affine_lowest_px(Dav1dTaskContext *const t,
- int *const dst, const int is_chroma,
- const uint8_t *const b_dim,
- const Dav1dWarpedMotionParams *const wmp)
+static ALWAYS_INLINE void affine_lowest_px(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp,
+ const int ss_ver, const int ss_hor)
{
- const Dav1dFrameContext *const f = t->f;
- const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
const int32_t *const mat = wmp->matrix;
@@ -792,6 +789,25 @@ static inline void affine_lowest_px(Dav1dTaskContext *const t,
}
}
+static NOINLINE void affine_lowest_px_luma(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ affine_lowest_px(t, dst, b_dim, wmp, 0, 0);
+}
+
+static NOINLINE void affine_lowest_px_chroma(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ const Dav1dFrameContext *const f = t->f;
+ assert(f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400);
+ if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I444)
+ affine_lowest_px_luma(t, dst, b_dim, wmp);
+ else
+ affine_lowest_px(t, dst, b_dim, wmp, f->cur.p.layout & DAV1D_PIXEL_LAYOUT_I420, 1);
+}
+
static void obmc_lowest_px(Dav1dTaskContext *const t,
int (*const dst)[2], const int is_chroma,
const uint8_t *const b_dim,
@@ -2071,11 +2087,14 @@ static int decode_b(Dav1dTaskContext *const t,
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+ enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx;
+ if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
+ ytx = (enum RectTxfmSize) TX_4X4;
+ uvtx = (enum RectTxfmSize) TX_4X4;
+ }
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
t->bx, t->by, f->w4, f->h4, b->skip, bs,
- f->frame_hdr->segmentation.lossless[b->seg_id] ?
- (enum RectTxfmSize) TX_4X4 : b->max_ytx,
- tx_split, b->uvtx, f->cur.p.layout,
+ ytx, tx_split, uvtx, f->cur.p.layout,
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
@@ -2150,9 +2169,9 @@ static int decode_b(Dav1dTaskContext *const t,
((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
(b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
{
- affine_lowest_px(t, &lowest_px[b->ref[0]][0], 0, b_dim,
- b->motion_mode == MM_WARP ? &t->warpmv :
- &f->frame_hdr->gmv[b->ref[0]]);
+ affine_lowest_px_luma(t, &lowest_px[b->ref[0]][0], b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
} else {
mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y,
0, &f->svc[b->ref[0]][1]);
@@ -2203,9 +2222,9 @@ static int decode_b(Dav1dTaskContext *const t,
((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
(b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
{
- affine_lowest_px(t, &lowest_px[b->ref[0]][1], 1, b_dim,
- b->motion_mode == MM_WARP ? &t->warpmv :
- &f->frame_hdr->gmv[b->ref[0]]);
+ affine_lowest_px_chroma(t, &lowest_px[b->ref[0]][1], b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
} else {
mc_lowest_px(&lowest_px[b->ref[0]][1],
t->by & ~ss_ver, bh4 << (bh4 == ss_ver),
@@ -2220,8 +2239,8 @@ static int decode_b(Dav1dTaskContext *const t,
// y
for (int i = 0; i < 2; i++) {
if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
- affine_lowest_px(t, &lowest_px[b->ref[i]][0], 0, b_dim,
- &f->frame_hdr->gmv[b->ref[i]]);
+ affine_lowest_px_luma(t, &lowest_px[b->ref[i]][0], b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
} else {
mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4,
b->mv[i].y, 0, &f->svc[b->ref[i]][1]);
@@ -2233,8 +2252,8 @@ static int decode_b(Dav1dTaskContext *const t,
if (b->inter_mode == GLOBALMV_GLOBALMV &&
imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
{
- affine_lowest_px(t, &lowest_px[b->ref[i]][1], 1, b_dim,
- &f->frame_hdr->gmv[b->ref[i]]);
+ affine_lowest_px_chroma(t, &lowest_px[b->ref[i]][1], b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
} else {
mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4,
b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]);
@@ -3407,7 +3426,7 @@ void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) {
(size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
}
for (int i = 0; i < 7; i++) {
- if (f->refp[i].p.data[0])
+ if (f->refp[i].p.frame_hdr)
dav1d_thread_picture_unref(&f->refp[i]);
dav1d_ref_dec(&f->ref_mvs_ref[i]);
}
@@ -3440,13 +3459,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
// wait until all threads have completed
if (!res) {
if (f->c->n_tc > 1) {
- pthread_mutex_lock(&f->task_thread.ttd->lock);
res = dav1d_task_create_tile_sbrow(f, 0, 1);
+ pthread_mutex_lock(&f->task_thread.ttd->lock);
+ pthread_cond_signal(&f->task_thread.ttd->cond);
if (!res) {
- const int uses_2pass = f->c->n_fc > 1;
while (!f->task_thread.done[0] ||
- (uses_2pass && !f->task_thread.done[1]) ||
- f->task_thread.task_counter > 0)
+ atomic_load(&f->task_thread.task_counter) > 0)
{
pthread_cond_wait(&f->task_thread.cond,
&f->task_thread.ttd->lock);
@@ -3469,7 +3487,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
static int get_upscale_x0(const int in_w, const int out_w, const int step) {
const int err = out_w * step - (in_w << 14);
- const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
+ const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2);
return x0 & 0x3fff;
}
@@ -3491,10 +3509,13 @@ int dav1d_submit_frame(Dav1dContext *const c) {
&c->task_thread.lock);
out_delayed = &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
- if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
@@ -3706,7 +3727,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
const int uses_2pass = c->n_fc > 1;
const int cols = f->frame_hdr->tiling.cols;
const int rows = f->frame_hdr->tiling.rows;
- f->task_thread.task_counter = (cols * rows + f->sbh) << uses_2pass;
+ atomic_store(&f->task_thread.task_counter,
+ (cols * rows + f->sbh) << uses_2pass);
// ref_mvs
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
@@ -3726,9 +3748,10 @@ int dav1d_submit_frame(Dav1dContext *const c) {
if (f->frame_hdr->use_ref_frame_mvs) {
for (int i = 0; i < 7; i++) {
const int refidx = f->frame_hdr->refidx[i];
+ const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1;
+ const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1;
if (c->refs[refidx].refmvs != NULL &&
- ref_coded_width[i] == f->cur.p.w &&
- f->refp[i].p.p.h == f->cur.p.h)
+ ref_w == f->bw && ref_h == f->bh)
{
f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
dav1d_ref_inc(f->ref_mvs_ref[i]);
@@ -3809,7 +3832,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
for (int i = 0; i < 8; i++) {
if (refresh_frame_flags & (1 << i)) {
- if (c->refs[i].p.p.data[0])
+ if (c->refs[i].p.p.frame_hdr)
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur);
@@ -3839,7 +3862,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
dav1d_thread_picture_unref(&c->out);
for (int i = 0; i < 8; i++) {
if (refresh_frame_flags & (1 << i)) {
- if (c->refs[i].p.p.data[0])
+ if (c->refs[i].p.p.frame_hdr)
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_cdf_thread_unref(&c->cdf[i]);
dav1d_ref_dec(&c->refs[i].segmap);
@@ -3860,7 +3883,7 @@ error:
if (f->frame_hdr->refresh_context)
dav1d_cdf_thread_unref(&f->out_cdf);
for (int i = 0; i < 7; i++) {
- if (f->refp[i].p.data[0])
+ if (f->refp[i].p.frame_hdr)
dav1d_thread_picture_unref(&f->refp[i]);
dav1d_ref_dec(&f->ref_mvs_ref[i]);
}
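
For illustration (not part of the patch): the get_upscale_x0() hunk above replaces "err >> 1" with "err / 2". For non-negative values the two agree, but for a negative signed operand they do not: an arithmetic right shift rounds toward negative infinity while C division truncates toward zero, as this hypothetical value shows.

#include <stdio.h>

int main(void) {
    const int err = -7;                   /* hypothetical negative error term */
    printf("err >> 1 = %d\n", err >> 1);  /* typically -4 (arithmetic shift, implementation-defined) */
    printf("err / 2  = %d\n", err / 2);   /* -3 (truncation toward zero, well-defined) */
    return 0;
}
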
diff --git a/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h b/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h
index 4f555957130..17763377bc9 100644
--- a/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h
+++ b/chromium/third_party/dav1d/libdav1d/src/dequant_tables.h
@@ -32,6 +32,6 @@
#include "src/levels.h"
-extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
+EXTERN const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c
index ee14db9a4ce..581bcb72f5d 100644
--- a/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/fg_apply_tmpl.c
@@ -51,6 +51,11 @@ static void generate_scaling(const int bitdepth,
const int scaling_size = 1 << bitdepth;
#endif
+ if (num == 0) {
+ memset(scaling, 0, scaling_size);
+ return;
+ }
+
// Fill up the preceding entries with the initial value
memset(scaling, points[0][1], points[0][0] << shift_x);
@@ -113,7 +118,7 @@ void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
data, 1 HIGHBD_TAIL_SUFFIX);
// Generate scaling LUTs as needed
- if (data->num_y_points)
+ if (data->num_y_points || data->chroma_scaling_from_luma)
generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
if (data->num_uv_points[0])
generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
diff --git a/chromium/third_party/dav1d/libdav1d/src/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/filmgrain.h
index d953542a82a..a5d6be6d44f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/filmgrain.h
+++ b/chromium/third_party/dav1d/libdav1d/src/filmgrain.h
@@ -64,7 +64,7 @@ typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
#define decl_fguv_32x32xn_fn(name) \
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
- const Dav1dFilmGrainData *data, int pw, \
+ const Dav1dFilmGrainData *data, size_t pw, \
const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
const pixel *luma_row, ptrdiff_t luma_stride, \
@@ -80,7 +80,5 @@ typedef struct Dav1dFilmGrainDSPContext {
} Dav1dFilmGrainDSPContext;
bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
-bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c);
-bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
#endif /* DAV1D_SRC_FILM_GRAIN_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c
index 883c5cbb7b9..0986ac2a58c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/filmgrain_tmpl.c
@@ -278,7 +278,7 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
static NOINLINE void
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
- const int pw, const uint8_t scaling[SCALING_SIZE],
+ const size_t pw, const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH], const int bh,
const int row_num, const pixel *const luma_row,
const ptrdiff_t luma_stride, const int uv, const int is_id,
@@ -311,8 +311,8 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
- for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
- const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
+ const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx));
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)
@@ -412,6 +412,14 @@ fguv_ss_fn(420, 1, 1);
fguv_ss_fn(422, 1, 0);
fguv_ss_fn(444, 0, 0);
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/filmgrain.h"
+#elif ARCH_X86
+#include "src/x86/filmgrain.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
c->generate_grain_y = generate_grain_y_c;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
@@ -425,9 +433,9 @@ COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_film_grain_dsp_init_arm)(c);
+ film_grain_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_film_grain_dsp_init_x86)(c);
+ film_grain_dsp_init_x86(c);
#endif
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/getbits.c b/chromium/third_party/dav1d/libdav1d/src/getbits.c
index 7bb20140e41..673070be3dd 100644
--- a/chromium/third_party/dav1d/libdav1d/src/getbits.c
+++ b/chromium/third_party/dav1d/libdav1d/src/getbits.c
@@ -36,51 +36,62 @@
void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data,
const size_t sz)
{
- // If sz were 0, c->eof would need to be initialized to 1.
assert(sz);
c->ptr = c->ptr_start = data;
c->ptr_end = &c->ptr_start[sz];
- c->bits_left = 0;
c->state = 0;
+ c->bits_left = 0;
c->error = 0;
- c->eof = 0;
}
-static void refill(GetBits *const c, const unsigned n) {
- assert(c->bits_left <= 56);
- uint64_t state = 0;
- do {
- state <<= 8;
- c->bits_left += 8;
- if (!c->eof)
- state |= *c->ptr++;
+unsigned dav1d_get_bit(GetBits *const c) {
+ if (!c->bits_left) {
if (c->ptr >= c->ptr_end) {
- c->error = c->eof;
- c->eof = 1;
+ c->error = 1;
+ } else {
+ const unsigned state = *c->ptr++;
+ c->bits_left = 7;
+ c->state = (uint64_t) state << 57;
+ return state >> 7;
}
- } while (n > c->bits_left);
- c->state |= state << (64 - c->bits_left);
-}
-
-unsigned dav1d_get_bits(GetBits *const c, const unsigned n) {
- assert(n <= 32 /* can go up to 57 if we change return type */);
- assert(n /* can't shift state by 64 */);
-
- if (n > c->bits_left) refill(c, n);
+ }
const uint64_t state = c->state;
- c->bits_left -= n;
- c->state <<= n;
+ c->bits_left--;
+ c->state = state << 1;
+ return (unsigned) (state >> 63);
+}
- return (unsigned) (state >> (64 - n));
+static inline void refill(GetBits *const c, const int n) {
+ assert(c->bits_left >= 0 && c->bits_left < 32);
+ unsigned state = 0;
+ do {
+ if (c->ptr >= c->ptr_end) {
+ c->error = 1;
+ if (state) break;
+ return;
+ }
+ state = (state << 8) | *c->ptr++;
+ c->bits_left += 8;
+ } while (n > c->bits_left);
+ c->state |= (uint64_t) state << (64 - c->bits_left);
}
-int dav1d_get_sbits(GetBits *const c, const unsigned n) {
- const int shift = 31 - n;
- const int res = dav1d_get_bits(c, n + 1) << shift;
- return res >> shift;
+#define GET_BITS(name, type, type64) \
+type name(GetBits *const c, const int n) { \
+ assert(n > 0 && n <= 32); \
+ /* Unsigned cast avoids refill after eob */ \
+ if ((unsigned) n > (unsigned) c->bits_left) \
+ refill(c, n); \
+ const uint64_t state = c->state; \
+ c->bits_left -= n; \
+ c->state = state << n; \
+ return (type) ((type64) state >> (64 - n)); \
}
+GET_BITS(dav1d_get_bits, unsigned, uint64_t)
+GET_BITS(dav1d_get_sbits, int, int64_t)
+
unsigned dav1d_get_uleb128(GetBits *const c) {
uint64_t val = 0;
unsigned i = 0, more;
@@ -108,15 +119,20 @@ unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) {
assert(l > 1);
const unsigned m = (1U << l) - max;
const unsigned v = dav1d_get_bits(c, l - 1);
- return v < m ? v : (v << 1) - m + dav1d_get_bits(c, 1);
+ return v < m ? v : (v << 1) - m + dav1d_get_bit(c);
}
unsigned dav1d_get_vlc(GetBits *const c) {
+ if (dav1d_get_bit(c))
+ return 0;
+
int n_bits = 0;
- while (!dav1d_get_bits(c, 1))
+ do {
if (++n_bits == 32)
return 0xFFFFFFFFU;
- return n_bits ? ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits) : 0;
+ } while (!dav1d_get_bit(c));
+
+ return ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits);
}
static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref,
@@ -132,7 +148,7 @@ static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref,
break;
}
- if (!dav1d_get_bits(c, 1)) {
+ if (!dav1d_get_bit(c)) {
v += dav1d_get_bits(c, b);
break;
}
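
For illustration (not part of the patch): the refactored bit reader above keeps its pending bits left-aligned in a 64-bit state word and serves them MSB-first, refilling a byte at a time. A minimal standalone sketch of that layout, with hypothetical input bytes:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t data[] = { 0xA5, 0x3C };
    const uint8_t *ptr = data, *end = data + sizeof(data);
    uint64_t state = 0;                         /* pending bits, left-aligned */
    int bits_left = 0;

    for (int i = 0; i < 12; i++) {
        if (!bits_left) {                       /* refill one byte into the top of the state */
            if (ptr >= end) break;
            state = (uint64_t) *ptr++ << 56;
            bits_left = 8;
        }
        printf("%u", (unsigned) (state >> 63)); /* consume the most significant bit */
        state <<= 1;
        bits_left--;
    }
    putchar('\n');                              /* prints 101001010011 for these bytes */
    return 0;
}
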
diff --git a/chromium/third_party/dav1d/libdav1d/src/getbits.h b/chromium/third_party/dav1d/libdav1d/src/getbits.h
index fc382148b2e..57b80dc7143 100644
--- a/chromium/third_party/dav1d/libdav1d/src/getbits.h
+++ b/chromium/third_party/dav1d/libdav1d/src/getbits.h
@@ -32,15 +32,15 @@
#include <stdint.h>
typedef struct GetBits {
- int error, eof;
uint64_t state;
- unsigned bits_left;
+ int bits_left, error;
const uint8_t *ptr, *ptr_start, *ptr_end;
} GetBits;
void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz);
-unsigned dav1d_get_bits(GetBits *c, unsigned n);
-int dav1d_get_sbits(GetBits *c, unsigned n);
+unsigned dav1d_get_bit(GetBits *c);
+unsigned dav1d_get_bits(GetBits *c, int n);
+int dav1d_get_sbits(GetBits *c, int n);
unsigned dav1d_get_uleb128(GetBits *c);
// Output in range 0..max-1
diff --git a/chromium/third_party/dav1d/libdav1d/src/internal.h b/chromium/third_party/dav1d/libdav1d/src/internal.h
index eceda98eca4..b5fd1e18ef3 100644
--- a/chromium/third_party/dav1d/libdav1d/src/internal.h
+++ b/chromium/third_party/dav1d/libdav1d/src/internal.h
@@ -194,6 +194,7 @@ struct Dav1dContext {
int strict_std_compliance;
int output_invisible_frames;
enum Dav1dInloopFilterType inloop_filters;
+ enum Dav1dDecodeFrameType decode_frame_type;
int drain;
enum PictureFlags frame_flags;
enum Dav1dEventFlags event_flags;
@@ -275,7 +276,7 @@ struct Dav1dFrameContext {
struct {
int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
- int entropy_progress;
+ atomic_int entropy_progress;
atomic_int deblock_progress; // in sby units
atomic_uint *frame_progress, *copy_lpf_progress;
// indexed using t->by * f->b4_stride + t->bx
@@ -324,22 +325,28 @@ struct Dav1dFrameContext {
} lf;
struct {
+ pthread_mutex_t lock;
pthread_cond_t cond;
struct TaskThreadData *ttd;
struct Dav1dTask *tasks, *tile_tasks[2], init_task;
int num_tasks, num_tile_tasks;
- int init_done;
- int done[2];
+ atomic_int init_done;
+ atomic_int done[2];
int retval;
int update_set; // whether we need to update CDF reference
atomic_int error;
- int task_counter;
+ atomic_int task_counter;
struct Dav1dTask *task_head, *task_tail;
// Points to the task directly before the cur pointer in the queue.
// This cur pointer is theoretical here, we actually keep track of the
// "prev_t" variable. This is needed to not loose the tasks in
// [head;cur-1] when picking one for execution.
struct Dav1dTask *task_cur_prev;
+ struct { // async task insertion
+ atomic_int merge;
+ pthread_mutex_t lock;
+ Dav1dTask *head, *tail;
+ } pending_tasks;
} task_thread;
// threading (refer to tc[] for per-thread things)
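
The internal.h changes turn the frame-level synchronization fields into C11 atomics and add a second, smaller queue (pending_tasks) so tasks can be inserted asynchronously and merged into the main task_head/task_tail list later. The merge itself lives in thread_task.c and is not part of this hunk; the sketch below only illustrates, under that assumption, how a worker could drain such a staging queue using the fields declared above (merge flag checked first, then the list detached under pending_tasks.lock):

    static void merge_pending_sketch(Dav1dFrameContext *const f) {
        /* hypothetical sketch, not the actual thread_task.c implementation */
        if (!atomic_exchange(&f->task_thread.pending_tasks.merge, 0))
            return;                                   /* nothing staged */
        pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
        Dav1dTask *const staged = f->task_thread.pending_tasks.head;
        f->task_thread.pending_tasks.head = NULL;
        f->task_thread.pending_tasks.tail = NULL;
        pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
        /* ...append 'staged' (and its successors) to task_head/task_tail while
         * holding f->task_thread.lock... */
        (void) staged;
    }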
diff --git a/chromium/third_party/dav1d/libdav1d/src/ipred.h b/chromium/third_party/dav1d/libdav1d/src/ipred.h
index 8664f3f993c..739ef1a266f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/ipred.h
+++ b/chromium/third_party/dav1d/libdav1d/src/ipred.h
@@ -90,7 +90,5 @@ typedef struct Dav1dIntraPredDSPContext {
} Dav1dIntraPredDSPContext;
bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
-bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
-bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
#endif /* DAV1D_SRC_IPRED_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c
index 50c7a3c7bee..151d4842a04 100644
--- a/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/ipred_tmpl.c
@@ -726,6 +726,14 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
}
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/ipred.h"
+#elif ARCH_X86
+#include "src/x86/ipred.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
c->intra_pred[DC_PRED ] = ipred_dc_c;
c->intra_pred[DC_128_PRED ] = ipred_dc_128_c;
@@ -755,9 +763,9 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_intra_pred_dsp_init_arm)(c);
+ intra_pred_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+ intra_pred_dsp_init_x86(c);
#endif
#endif
}
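
This hunk shows the new DSP-init pattern that repeats below for itx, loopfilter, looprestoration, mc, msac and refmvs: the per-architecture *_init_tmpl.c files are removed, and each architecture instead provides a header (src/arm/ipred.h, src/x86/ipred.h, ...) defining a static init function that the shared template includes and calls directly, so every bitdepth build gets the arch-specific pointer setup inlined rather than calling an exported dav1d_*_dsp_init_{arm,x86} symbol. A rough sketch of what such a header plausibly looks like (illustrative only; the real headers also declare the assembly entry points and cover many more pointers):

    static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
        const unsigned flags = dav1d_get_cpu_flags();
        if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
        c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);  /* assumed symbol name */
        /* ...remaining function pointers... */
    }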
diff --git a/chromium/third_party/dav1d/libdav1d/src/itx.h b/chromium/third_party/dav1d/libdav1d/src/itx.h
index 08f5e212853..d522079907e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/itx.h
+++ b/chromium/third_party/dav1d/libdav1d/src/itx.h
@@ -44,7 +44,5 @@ typedef struct Dav1dInvTxfmDSPContext {
} Dav1dInvTxfmDSPContext;
bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
-bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc);
-bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c, int bpc);
#endif /* DAV1D_SRC_ITX_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c
index 2f97a9cd798..d3859892d8b 100644
--- a/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/itx_tmpl.c
@@ -180,6 +180,14 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
dst[x] = iclip_pixel(dst[x] + *c++);
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/itx.h"
+#elif ARCH_X86
+#include "src/x86/itx.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#define assign_itx_all_fn64(w, h, pfx) \
c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
@@ -247,10 +255,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_itx_dsp_init_arm)(c, bpc);
+ itx_dsp_init_arm(c, bpc);
#endif
#if ARCH_X86
- bitfn(dav1d_itx_dsp_init_x86)(c, bpc);
+ itx_dsp_init_x86(c, bpc);
#endif
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/lf_mask.c b/chromium/third_party/dav1d/libdav1d/src/lf_mask.c
index 411c88400e7..91fe4a02c8d 100644
--- a/chromium/third_party/dav1d/libdav1d/src/lf_mask.c
+++ b/chromium/third_party/dav1d/libdav1d/src/lf_mask.c
@@ -212,13 +212,13 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
#undef set_ctx
}
-static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
- const int cby4, const int cbx4,
- const int cw4, const int ch4,
- const int skip_inter,
- const enum RectTxfmSize tx,
- uint8_t *const a, uint8_t *const l,
- const int ss_hor, const int ss_ver)
+static void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
+ const int cby4, const int cbx4,
+ const int cw4, const int ch4,
+ const int skip_inter,
+ const enum RectTxfmSize tx,
+ uint8_t *const a, uint8_t *const l,
+ const int ss_hor, const int ss_ver)
{
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
const int twl4 = t_dim->lw, thl4 = t_dim->lh;
@@ -424,16 +424,14 @@ void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff;
}
-static inline void calc_lf_value(uint8_t (*const lflvl_values)[2],
- const int is_chroma, const int base_lvl,
- const int lf_delta, const int seg_delta,
- const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+static void calc_lf_value(uint8_t (*const lflvl_values)[2],
+ const int base_lvl, const int lf_delta,
+ const int seg_delta,
+ const Dav1dLoopfilterModeRefDeltas *const mr_delta)
{
const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
- if (!base_lvl && is_chroma) {
- memset(lflvl_values, 0, 8 * 2);
- } else if (!mr_delta) {
+ if (!mr_delta) {
memset(lflvl_values, base, 8 * 2);
} else {
const int sh = base >= 32;
@@ -449,6 +447,17 @@ static inline void calc_lf_value(uint8_t (*const lflvl_values)[2],
}
}
+static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2],
+ const int base_lvl, const int lf_delta,
+ const int seg_delta,
+ const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+{
+ if (!base_lvl)
+ memset(lflvl_values, 0, 8 * 2);
+ else
+ calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta);
+}
+
void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
const Dav1dFrameHeader *const hdr,
const int8_t lf_delta[4])
@@ -467,16 +476,16 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
const Dav1dSegmentationData *const segd =
hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL;
- calc_lf_value(lflvl_values[s][0], 0, hdr->loopfilter.level_y[0],
+ calc_lf_value(lflvl_values[s][0], hdr->loopfilter.level_y[0],
lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas);
- calc_lf_value(lflvl_values[s][1], 0, hdr->loopfilter.level_y[1],
+ calc_lf_value(lflvl_values[s][1], hdr->loopfilter.level_y[1],
lf_delta[hdr->delta.lf.multi ? 1 : 0],
segd ? segd->delta_lf_y_h : 0, mr_deltas);
- calc_lf_value(lflvl_values[s][2], 1, hdr->loopfilter.level_u,
- lf_delta[hdr->delta.lf.multi ? 2 : 0],
- segd ? segd->delta_lf_u : 0, mr_deltas);
- calc_lf_value(lflvl_values[s][3], 1, hdr->loopfilter.level_v,
- lf_delta[hdr->delta.lf.multi ? 3 : 0],
- segd ? segd->delta_lf_v : 0, mr_deltas);
+ calc_lf_value_chroma(lflvl_values[s][2], hdr->loopfilter.level_u,
+ lf_delta[hdr->delta.lf.multi ? 2 : 0],
+ segd ? segd->delta_lf_u : 0, mr_deltas);
+ calc_lf_value_chroma(lflvl_values[s][3], hdr->loopfilter.level_v,
+ lf_delta[hdr->delta.lf.multi ? 3 : 0],
+ segd ? segd->delta_lf_v : 0, mr_deltas);
}
}
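
The loop-filter level computation is split so the is_chroma flag disappears from the common path: calc_lf_value() now always derives the level from the doubly clamped base, and the new calc_lf_value_chroma() wrapper zeroes the chroma levels when the corresponding base level is 0 (deltas must not revive a disabled chroma filter). A small worked example of the clamp, using the formula from the diff:

    /* base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63)
     * base_lvl = 32, lf_delta = 40, seg_delta = -5:
     *   inner = iclip(72, 0, 63) = 63
     *   base  = iclip(58, 0, 63) = 58
     * calc_lf_value_chroma(base_lvl = 0, ...) writes all zeros instead. */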
diff --git a/chromium/third_party/dav1d/libdav1d/src/lib.c b/chromium/third_party/dav1d/libdav1d/src/lib.c
index b21a735964f..396a57c98f4 100644
--- a/chromium/third_party/dav1d/libdav1d/src/lib.c
+++ b/chromium/third_party/dav1d/libdav1d/src/lib.c
@@ -77,6 +77,7 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->strict_std_compliance = 0;
s->output_invisible_frames = 0;
s->inloop_filters = DAV1D_INLOOPFILTER_ALL;
+ s->decode_frame_type = DAV1D_DECODEFRAMETYPE_ALL;
}
static void close_internal(Dav1dContext **const c_out, int flush);
@@ -97,6 +98,37 @@ static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_at
return 0;
}
+static COLD void get_num_threads(Dav1dContext *const c, const Dav1dSettings *const s,
+ unsigned *n_tc, unsigned *n_fc)
+{
+ /* ceil(sqrt(n)) */
+ static const uint8_t fc_lut[49] = {
+ 1, /* 1 */
+ 2, 2, 2, /* 2- 4 */
+ 3, 3, 3, 3, 3, /* 5- 9 */
+ 4, 4, 4, 4, 4, 4, 4, /* 10-16 */
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 17-25 */
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 26-36 */
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */
+ };
+ *n_tc = s->n_threads ? s->n_threads :
+ iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS);
+ *n_fc = s->max_frame_delay ? umin(s->max_frame_delay, *n_tc) :
+ *n_tc < 50 ? fc_lut[*n_tc - 1] : 8; // min(8, ceil(sqrt(n)))
+}
+
+COLD int dav1d_get_frame_delay(const Dav1dSettings *const s) {
+ unsigned n_tc, n_fc;
+ validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->n_threads >= 0 &&
+ s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->max_frame_delay >= 0 &&
+ s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
+
+ get_num_threads(NULL, s, &n_tc, &n_fc);
+ return n_fc;
+}
+
COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
static pthread_once_t initted = PTHREAD_ONCE_INIT;
pthread_once(&initted, init_internal);
@@ -113,6 +145,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
DAV1D_ERR(EINVAL));
validate_input_or_ret(s->operating_point >= 0 &&
s->operating_point <= 31, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->decode_frame_type >= DAV1D_DECODEFRAMETYPE_ALL &&
+ s->decode_frame_type <= DAV1D_DECODEFRAMETYPE_KEY, DAV1D_ERR(EINVAL));
pthread_attr_t thread_attr;
if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
@@ -133,6 +167,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
c->strict_std_compliance = s->strict_std_compliance;
c->output_invisible_frames = s->output_invisible_frames;
c->inloop_filters = s->inloop_filters;
+ c->decode_frame_type = s->decode_frame_type;
dav1d_data_props_set_defaults(&c->cached_error_props);
@@ -171,20 +206,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
c->flush = &c->flush_mem;
atomic_init(c->flush, 0);
- c->n_tc = s->n_threads ? s->n_threads :
- iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS);
- /* ceil(sqrt(n)) */
- static const uint8_t fc_lut[49] = {
- 1, /* 1 */
- 2, 2, 2, /* 2- 4 */
- 3, 3, 3, 3, 3, /* 5- 9 */
- 4, 4, 4, 4, 4, 4, 4, /* 10-16 */
- 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 17-25 */
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 26-36 */
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */
- };
- c->n_fc = s->max_frame_delay ? umin(s->max_frame_delay, c->n_tc) :
- c->n_tc < 50 ? fc_lut[c->n_tc - 1] : 8; // min(8, ceil(sqrt(n)))
+ get_num_threads(c, s, &c->n_tc, &c->n_fc);
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32);
if (!c->fc) goto error;
@@ -217,8 +239,18 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
}
for (unsigned n = 0; n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
- if (c->n_tc > 1)
- if (pthread_cond_init(&f->task_thread.cond, NULL)) goto error;
+ if (c->n_tc > 1) {
+ if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error;
+ if (pthread_cond_init(&f->task_thread.cond, NULL)) {
+ pthread_mutex_destroy(&f->task_thread.lock);
+ goto error;
+ }
+ if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, NULL)) {
+ pthread_cond_destroy(&f->task_thread.cond);
+ pthread_mutex_destroy(&f->task_thread.lock);
+ goto error;
+ }
+ }
f->c = c;
f->task_thread.ttd = &c->task_thread;
f->lf.last_sharpness = -1;
@@ -317,7 +349,8 @@ static int has_grain(const Dav1dPicture *const pic)
{
const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data;
return fgdata->num_y_points || fgdata->num_uv_points[0] ||
- fgdata->num_uv_points[1];
+ fgdata->num_uv_points[1] || (fgdata->clip_to_restricted_range &&
+ fgdata->chroma_scaling_from_luma);
}
static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
@@ -374,10 +407,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
- if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
@@ -529,16 +565,16 @@ error:
void dav1d_flush(Dav1dContext *const c) {
dav1d_data_unref_internal(&c->in);
- if (c->out.p.data[0])
+ if (c->out.p.frame_hdr)
dav1d_thread_picture_unref(&c->out);
- if (c->cache.p.data[0])
+ if (c->cache.p.frame_hdr)
dav1d_thread_picture_unref(&c->cache);
c->drain = 0;
c->cached_error = 0;
for (int i = 0; i < 8; i++) {
- if (c->refs[i].p.p.data[0])
+ if (c->refs[i].p.p.frame_hdr)
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_ref_dec(&c->refs[i].segmap);
dav1d_ref_dec(&c->refs[i].refmvs);
@@ -573,6 +609,9 @@ void dav1d_flush(Dav1dContext *const c) {
c->fc[i].task_thread.task_head = NULL;
c->fc[i].task_thread.task_tail = NULL;
c->fc[i].task_thread.task_cur_prev = NULL;
+ c->fc[i].task_thread.pending_tasks.head = NULL;
+ c->fc[i].task_thread.pending_tasks.tail = NULL;
+ atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0);
}
atomic_init(&c->task_thread.first, 0);
c->task_thread.cur = c->n_fc;
@@ -590,7 +629,7 @@ void dav1d_flush(Dav1dContext *const c) {
f->n_tile_data = 0;
f->task_thread.retval = 0;
Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
- if (out_delayed->p.data[0]) {
+ if (out_delayed->p.frame_hdr) {
dav1d_thread_picture_unref(out_delayed);
}
}
@@ -646,7 +685,9 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
freep(&f->frame_thread.cbi);
}
if (c->n_tc > 1) {
+ pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
pthread_cond_destroy(&f->task_thread.cond);
+ pthread_mutex_destroy(&f->task_thread.lock);
}
freep(&f->frame_thread.frame_progress);
freep(&f->task_thread.tasks);
@@ -667,7 +708,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
dav1d_free_aligned(c->fc);
if (c->n_fc > 1 && c->frame_thread.out_delayed) {
for (unsigned n = 0; n < c->n_fc; n++)
- if (c->frame_thread.out_delayed[n].p.data[0])
+ if (c->frame_thread.out_delayed[n].p.frame_hdr)
dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
free(c->frame_thread.out_delayed);
}
@@ -676,7 +717,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
free(c->tile);
for (int n = 0; n < 8; n++) {
dav1d_cdf_thread_unref(&c->cdf[n]);
- if (c->refs[n].p.p.data[0])
+ if (c->refs[n].p.p.frame_hdr)
dav1d_thread_picture_unref(&c->refs[n].p);
dav1d_ref_dec(&c->refs[n].refmvs);
dav1d_ref_dec(&c->refs[n].segmap);
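
Summarizing the lib.c changes: thread sizing moves into get_num_threads() so the new dav1d_get_frame_delay() entry point can report, for a given Dav1dSettings, how many frame contexts (and therefore how much output delay) dav1d_open() would create; per-frame mutexes and the pending_tasks lock are initialized and destroyed alongside the existing condition variable; has_grain() now also returns true for the clip_to_restricted_range plus chroma_scaling_from_luma combination; and the "is this picture slot in use" checks switch from p.data[0] to p.frame_hdr so header-only reference slots (created by the new frame-skipping path in obu.c) are released correctly on flush/close. The fc_lut table is just ceil(sqrt(n)) for n = 1..49, capped at 8; an equivalent closed form, for illustration only:

    static unsigned frame_contexts_for(const unsigned n_tc) {
        unsigned n_fc = 1;
        while (n_fc * n_fc < n_tc) n_fc++;   /* ceil(sqrt(n_tc)) */
        return n_fc < 8 ? n_fc : 8;          /* min(8, ceil(sqrt(n_tc))) */
    }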
diff --git a/chromium/third_party/dav1d/libdav1d/src/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/loopfilter.h
index c159050b26a..a0f78c96574 100644
--- a/chromium/third_party/dav1d/libdav1d/src/loopfilter.h
+++ b/chromium/third_party/dav1d/libdav1d/src/loopfilter.h
@@ -53,7 +53,5 @@ typedef struct Dav1dLoopFilterDSPContext {
} Dav1dLoopFilterDSPContext;
bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
-bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
-bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
#endif /* DAV1D_SRC_LOOPFILTER_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c
index 6ea744f37bc..cacf2587564 100644
--- a/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/loopfilter_tmpl.c
@@ -244,6 +244,14 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
}
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/loopfilter.h"
+#elif ARCH_X86
+#include "src/x86/loopfilter.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
@@ -252,9 +260,9 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_loop_filter_dsp_init_arm)(c);
+ loop_filter_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_loop_filter_dsp_init_x86)(c);
+ loop_filter_dsp_init_x86(c);
#endif
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/looprestoration.h
index d0ab8110eb8..f55dd319471 100644
--- a/chromium/third_party/dav1d/libdav1d/src/looprestoration.h
+++ b/chromium/third_party/dav1d/libdav1d/src/looprestoration.h
@@ -75,8 +75,5 @@ typedef struct Dav1dLoopRestorationDSPContext {
} Dav1dLoopRestorationDSPContext;
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c, int bpc);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c, int bpc);
#endif /* DAV1D_SRC_LOOPRESTORATION_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c
index 254c25d036f..d4d7867dba5 100644
--- a/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/looprestoration_tmpl.c
@@ -524,6 +524,16 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
}
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/looprestoration.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/looprestoration.h"
+#elif ARCH_X86
+#include "src/x86/looprestoration.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
const int bpc)
{
@@ -534,11 +544,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc);
+ loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_PPC64LE
- bitfn(dav1d_loop_restoration_dsp_init_ppc)(c, bpc);
+ loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86
- bitfn(dav1d_loop_restoration_dsp_init_x86)(c, bpc);
+ loop_restoration_dsp_init_x86(c, bpc);
#endif
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/mc.h b/chromium/third_party/dav1d/libdav1d/src/mc.h
index 784b58d2218..59ba2d9a5a0 100644
--- a/chromium/third_party/dav1d/libdav1d/src/mc.h
+++ b/chromium/third_party/dav1d/libdav1d/src/mc.h
@@ -132,7 +132,5 @@ typedef struct Dav1dMCDSPContext {
} Dav1dMCDSPContext;
bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c);
-bitfn_decls(void dav1d_mc_dsp_init_arm, Dav1dMCDSPContext *c);
-bitfn_decls(void dav1d_mc_dsp_init_x86, Dav1dMCDSPContext *c);
#endif /* DAV1D_SRC_MC_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c
index f8d3e3bda83..20226d8a398 100644
--- a/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/mc_tmpl.c
@@ -902,6 +902,14 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
} while (--h);
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/mc.h"
+#elif ARCH_X86
+#include "src/x86/mc.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#define init_mc_fns(type, name) do { \
c->mc [type] = put_##name##_c; \
@@ -937,9 +945,9 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_mc_dsp_init_arm)(c);
+ mc_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_mc_dsp_init_x86)(c);
+ mc_dsp_init_x86(c);
#endif
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/meson.build b/chromium/third_party/dav1d/libdav1d/src/meson.build
index a9ce1594dc5..719015496ee 100644
--- a/chromium/third_party/dav1d/libdav1d/src/meson.build
+++ b/chromium/third_party/dav1d/libdav1d/src/meson.build
@@ -92,16 +92,6 @@ if is_asm_enabled
libdav1d_sources += files(
'arm/cpu.c',
- 'arm/refmvs_init.c',
- )
- libdav1d_tmpl_sources += files(
- 'arm/cdef_init_tmpl.c',
- 'arm/filmgrain_init_tmpl.c',
- 'arm/ipred_init_tmpl.c',
- 'arm/itx_init_tmpl.c',
- 'arm/loopfilter_init_tmpl.c',
- 'arm/looprestoration_init_tmpl.c',
- 'arm/mc_init_tmpl.c',
)
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu() == 'arm64')
@@ -177,18 +167,6 @@ if is_asm_enabled
libdav1d_sources += files(
'x86/cpu.c',
- 'x86/msac_init.c',
- 'x86/refmvs_init.c',
- )
-
- libdav1d_tmpl_sources += files(
- 'x86/cdef_init_tmpl.c',
- 'x86/filmgrain_init_tmpl.c',
- 'x86/ipred_init_tmpl.c',
- 'x86/itx_init_tmpl.c',
- 'x86/loopfilter_init_tmpl.c',
- 'x86/looprestoration_init_tmpl.c',
- 'x86/mc_init_tmpl.c',
)
# NASM source files
@@ -196,6 +174,7 @@ if is_asm_enabled
'x86/cpuid.asm',
'x86/msac.asm',
'x86/refmvs.asm',
+ 'x86/itx_avx512.asm',
'x86/cdef_avx2.asm',
'x86/itx_avx2.asm',
'x86/looprestoration_avx2.asm',
@@ -208,7 +187,6 @@ if is_asm_enabled
'x86/cdef_avx512.asm',
'x86/filmgrain_avx512.asm',
'x86/ipred_avx512.asm',
- 'x86/itx_avx512.asm',
'x86/loopfilter_avx512.asm',
'x86/looprestoration_avx512.asm',
'x86/mc_avx512.asm',
@@ -226,8 +204,11 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
+ 'x86/cdef16_avx512.asm',
'x86/filmgrain16_avx512.asm',
'x86/ipred16_avx512.asm',
+ 'x86/itx16_avx512.asm',
+ 'x86/loopfilter16_avx512.asm',
'x86/looprestoration16_avx512.asm',
'x86/mc16_avx512.asm',
'x86/cdef16_avx2.asm',
@@ -255,8 +236,8 @@ if is_asm_enabled
'ppc/cpu.c',
)
libdav1d_arch_tmpl_sources += files(
- 'ppc/cdef_init_tmpl.c',
- 'ppc/looprestoration_init_tmpl.c',
+ 'ppc/cdef_tmpl.c',
+ 'ppc/looprestoration_tmpl.c',
)
endif
endif
diff --git a/chromium/third_party/dav1d/libdav1d/src/msac.c b/chromium/third_party/dav1d/libdav1d/src/msac.c
index d5f3207bb0d..43d8ae5d07c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/msac.c
+++ b/chromium/third_party/dav1d/libdav1d/src/msac.c
@@ -203,6 +203,6 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
#if ARCH_X86_64 && HAVE_ASM
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
- dav1d_msac_init_x86(s);
+ msac_init_x86(s);
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/obu.c b/chromium/third_party/dav1d/libdav1d/src/obu.c
index 7df6850a8c3..b6c2b6990bc 100644
--- a/chromium/third_party/dav1d/libdav1d/src/obu.c
+++ b/chromium/third_party/dav1d/libdav1d/src/obu.c
@@ -53,6 +53,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
const unsigned init_bit_pos = dav1d_get_bits_pos(gb);
#endif
+ memset(hdr, 0, sizeof(*hdr));
hdr->profile = dav1d_get_bits(gb, 3);
if (hdr->profile > 2) goto error;
#if DEBUG_SEQ_HDR
@@ -60,8 +61,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
- hdr->still_picture = dav1d_get_bits(gb, 1);
- hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1);
+ hdr->still_picture = dav1d_get_bit(gb);
+ hdr->reduced_still_picture_header = dav1d_get_bit(gb);
if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-stillpicture_flags: off=%u\n",
@@ -69,22 +70,16 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
#endif
if (hdr->reduced_still_picture_header) {
- hdr->timing_info_present = 0;
- hdr->decoder_model_info_present = 0;
- hdr->display_model_info_present = 0;
hdr->num_operating_points = 1;
- hdr->operating_points[0].idc = 0;
hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3);
hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2);
- hdr->operating_points[0].tier = 0;
- hdr->operating_points[0].decoder_model_param_present = 0;
- hdr->operating_points[0].display_model_param_present = 0;
+ hdr->operating_points[0].initial_display_delay = 10;
} else {
- hdr->timing_info_present = dav1d_get_bits(gb, 1);
+ hdr->timing_info_present = dav1d_get_bit(gb);
if (hdr->timing_info_present) {
hdr->num_units_in_tick = dav1d_get_bits(gb, 32);
hdr->time_scale = dav1d_get_bits(gb, 32);
- hdr->equal_picture_interval = dav1d_get_bits(gb, 1);
+ hdr->equal_picture_interval = dav1d_get_bit(gb);
if (hdr->equal_picture_interval) {
const unsigned num_ticks_per_picture = dav1d_get_vlc(gb);
if (num_ticks_per_picture == 0xFFFFFFFFU)
@@ -92,22 +87,20 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->num_ticks_per_picture = num_ticks_per_picture + 1;
}
- hdr->decoder_model_info_present = dav1d_get_bits(gb, 1);
+ hdr->decoder_model_info_present = dav1d_get_bit(gb);
if (hdr->decoder_model_info_present) {
hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1;
hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32);
hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1;
hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1;
}
- } else {
- hdr->decoder_model_info_present = 0;
}
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-timinginfo: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
- hdr->display_model_info_present = dav1d_get_bits(gb, 1);
+ hdr->display_model_info_present = dav1d_get_bit(gb);
hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1;
for (int i = 0; i < hdr->num_operating_points; i++) {
struct Dav1dSequenceHeaderOperatingPoint *const op =
@@ -117,23 +110,24 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
goto error;
op->major_level = 2 + dav1d_get_bits(gb, 3);
op->minor_level = dav1d_get_bits(gb, 2);
- op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
- op->decoder_model_param_present =
- hdr->decoder_model_info_present && dav1d_get_bits(gb, 1);
- if (op->decoder_model_param_present) {
- struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
- &hdr->operating_parameter_info[i];
- opi->decoder_buffer_delay =
- dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
- opi->encoder_buffer_delay =
- dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
- opi->low_delay_mode = dav1d_get_bits(gb, 1);
- }
- op->display_model_param_present =
- hdr->display_model_info_present && dav1d_get_bits(gb, 1);
- if (op->display_model_param_present) {
- op->initial_display_delay = dav1d_get_bits(gb, 4) + 1;
+ if (op->major_level > 3)
+ op->tier = dav1d_get_bit(gb);
+ if (hdr->decoder_model_info_present) {
+ op->decoder_model_param_present = dav1d_get_bit(gb);
+ if (op->decoder_model_param_present) {
+ struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
+ &hdr->operating_parameter_info[i];
+ opi->decoder_buffer_delay =
+ dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+ opi->encoder_buffer_delay =
+ dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+ opi->low_delay_mode = dav1d_get_bit(gb);
+ }
}
+ if (hdr->display_model_info_present)
+ op->display_model_param_present = dav1d_get_bit(gb);
+ op->initial_display_delay =
+ op->display_model_param_present ? dav1d_get_bits(gb, 4) + 1 : 10;
}
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-operating-points: off=%u\n",
@@ -155,67 +149,58 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
printf("SEQHDR: post-size: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
- hdr->frame_id_numbers_present =
- hdr->reduced_still_picture_header ? 0 : dav1d_get_bits(gb, 1);
- if (hdr->frame_id_numbers_present) {
- hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2;
- hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
+ if (!hdr->reduced_still_picture_header) {
+ hdr->frame_id_numbers_present = dav1d_get_bit(gb);
+ if (hdr->frame_id_numbers_present) {
+ hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2;
+ hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
+ }
}
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-frame-id-numbers-present: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
- hdr->sb128 = dav1d_get_bits(gb, 1);
- hdr->filter_intra = dav1d_get_bits(gb, 1);
- hdr->intra_edge_filter = dav1d_get_bits(gb, 1);
+ hdr->sb128 = dav1d_get_bit(gb);
+ hdr->filter_intra = dav1d_get_bit(gb);
+ hdr->intra_edge_filter = dav1d_get_bit(gb);
if (hdr->reduced_still_picture_header) {
- hdr->inter_intra = 0;
- hdr->masked_compound = 0;
- hdr->warped_motion = 0;
- hdr->dual_filter = 0;
- hdr->order_hint = 0;
- hdr->jnt_comp = 0;
- hdr->ref_frame_mvs = 0;
- hdr->order_hint_n_bits = 0;
hdr->screen_content_tools = DAV1D_ADAPTIVE;
hdr->force_integer_mv = DAV1D_ADAPTIVE;
} else {
- hdr->inter_intra = dav1d_get_bits(gb, 1);
- hdr->masked_compound = dav1d_get_bits(gb, 1);
- hdr->warped_motion = dav1d_get_bits(gb, 1);
- hdr->dual_filter = dav1d_get_bits(gb, 1);
- hdr->order_hint = dav1d_get_bits(gb, 1);
+ hdr->inter_intra = dav1d_get_bit(gb);
+ hdr->masked_compound = dav1d_get_bit(gb);
+ hdr->warped_motion = dav1d_get_bit(gb);
+ hdr->dual_filter = dav1d_get_bit(gb);
+ hdr->order_hint = dav1d_get_bit(gb);
if (hdr->order_hint) {
- hdr->jnt_comp = dav1d_get_bits(gb, 1);
- hdr->ref_frame_mvs = dav1d_get_bits(gb, 1);
- } else {
- hdr->jnt_comp = 0;
- hdr->ref_frame_mvs = 0;
- hdr->order_hint_n_bits = 0;
+ hdr->jnt_comp = dav1d_get_bit(gb);
+ hdr->ref_frame_mvs = dav1d_get_bit(gb);
}
- hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1);
+ hdr->screen_content_tools = dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-screentools: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
hdr->force_integer_mv = hdr->screen_content_tools ?
- dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1) : 2;
+ dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb) : 2;
if (hdr->order_hint)
hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1;
}
- hdr->super_res = dav1d_get_bits(gb, 1);
- hdr->cdef = dav1d_get_bits(gb, 1);
- hdr->restoration = dav1d_get_bits(gb, 1);
+ hdr->super_res = dav1d_get_bit(gb);
+ hdr->cdef = dav1d_get_bit(gb);
+ hdr->restoration = dav1d_get_bit(gb);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-featurebits: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
- hdr->hbd = dav1d_get_bits(gb, 1);
- if (hdr->profile == 2 && hdr->hbd) hdr->hbd += dav1d_get_bits(gb, 1);
- hdr->monochrome = hdr->profile != 1 ? dav1d_get_bits(gb, 1) : 0;
- hdr->color_description_present = dav1d_get_bits(gb, 1);
+ hdr->hbd = dav1d_get_bit(gb);
+ if (hdr->profile == 2 && hdr->hbd)
+ hdr->hbd += dav1d_get_bit(gb);
+ if (hdr->profile != 1)
+ hdr->monochrome = dav1d_get_bit(gb);
+ hdr->color_description_present = dav1d_get_bit(gb);
if (hdr->color_description_present) {
hdr->pri = dav1d_get_bits(gb, 8);
hdr->trc = dav1d_get_bits(gb, 8);
@@ -226,44 +211,40 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->mtrx = DAV1D_MC_UNKNOWN;
}
if (hdr->monochrome) {
- hdr->color_range = dav1d_get_bits(gb, 1);
+ hdr->color_range = dav1d_get_bit(gb);
hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
hdr->ss_hor = hdr->ss_ver = 1;
hdr->chr = DAV1D_CHR_UNKNOWN;
- hdr->separate_uv_delta_q = 0;
} else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
hdr->trc == DAV1D_TRC_SRGB &&
hdr->mtrx == DAV1D_MC_IDENTITY)
{
hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
- hdr->ss_hor = hdr->ss_ver = 0;
hdr->color_range = 1;
if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
goto error;
} else {
- hdr->color_range = dav1d_get_bits(gb, 1);
+ hdr->color_range = dav1d_get_bit(gb);
switch (hdr->profile) {
case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
hdr->ss_hor = hdr->ss_ver = 1;
break;
case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
- hdr->ss_hor = hdr->ss_ver = 0;
break;
case 2:
if (hdr->hbd == 2) {
- hdr->ss_hor = dav1d_get_bits(gb, 1);
- hdr->ss_ver = hdr->ss_hor && dav1d_get_bits(gb, 1);
- } else {
+ hdr->ss_hor = dav1d_get_bit(gb);
+ if (hdr->ss_hor)
+ hdr->ss_ver = dav1d_get_bit(gb);
+ } else
hdr->ss_hor = 1;
- hdr->ss_ver = 0;
- }
hdr->layout = hdr->ss_hor ?
hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
DAV1D_PIXEL_LAYOUT_I422 :
DAV1D_PIXEL_LAYOUT_I444;
break;
}
- hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
+ hdr->chr = (hdr->ss_hor & hdr->ss_ver) ?
dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
}
if (c->strict_std_compliance &&
@@ -271,19 +252,20 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
{
goto error;
}
- hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
+ if (!hdr->monochrome)
+ hdr->separate_uv_delta_q = dav1d_get_bit(gb);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-colorinfo: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
- hdr->film_grain_present = dav1d_get_bits(gb, 1);
+ hdr->film_grain_present = dav1d_get_bit(gb);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-filmgrain: off=%u\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
#endif
- dav1d_get_bits(gb, 1); // dummy bit
+ dav1d_get_bit(gb); // dummy bit
// We needn't bother flushing the OBU here: we'll check we didn't
// overrun in the caller and will then discard gb, so there's no
@@ -304,15 +286,15 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
if (use_ref) {
for (int i = 0; i < 7; i++) {
- if (dav1d_get_bits(gb, 1)) {
+ if (dav1d_get_bit(gb)) {
const Dav1dThreadPicture *const ref =
&c->refs[c->frame_hdr->refidx[i]].p;
- if (!ref->p.data[0]) return -1;
- hdr->width[1] = ref->p.p.w;
- hdr->height = ref->p.p.h;
+ if (!ref->p.frame_hdr) return -1;
+ hdr->width[1] = ref->p.frame_hdr->width[1];
+ hdr->height = ref->p.frame_hdr->height;
hdr->render_width = ref->p.frame_hdr->render_width;
hdr->render_height = ref->p.frame_hdr->render_height;
- hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
+ hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
if (hdr->super_res.enabled) {
const int d = hdr->super_res.width_scale_denominator =
9 + dav1d_get_bits(gb, 3);
@@ -334,7 +316,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
hdr->width[1] = seqhdr->max_width;
hdr->height = seqhdr->max_height;
}
- hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
+ hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
if (hdr->super_res.enabled) {
const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3);
hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1]));
@@ -342,7 +324,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
hdr->super_res.width_scale_denominator = 8;
hdr->width[0] = hdr->width[1];
}
- hdr->have_render_size = dav1d_get_bits(gb, 1);
+ hdr->have_render_size = dav1d_get_bit(gb);
if (hdr->have_render_size) {
hdr->render_width = dav1d_get_bits(gb, 16) + 1;
hdr->render_height = dav1d_get_bits(gb, 16) + 1;
@@ -374,7 +356,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
Dav1dFrameHeader *const hdr = c->frame_hdr;
hdr->show_existing_frame =
- !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1);
+ !seqhdr->reduced_still_picture_header && dav1d_get_bit(gb);
#if DEBUG_FRAME_HDR
printf("HDR: post-show_existing_frame: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
@@ -392,26 +374,27 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
hdr->frame_type = seqhdr->reduced_still_picture_header ? DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2);
- hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1);
+ hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bit(gb);
if (hdr->show_frame) {
if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
+ hdr->showable_frame = hdr->frame_type != DAV1D_FRAME_TYPE_KEY;
} else
- hdr->showable_frame = dav1d_get_bits(gb, 1);
+ hdr->showable_frame = dav1d_get_bit(gb);
hdr->error_resilient_mode =
(hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) ||
hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ||
- seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1);
+ seqhdr->reduced_still_picture_header || dav1d_get_bit(gb);
#if DEBUG_FRAME_HDR
printf("HDR: post-frametype_bits: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
- hdr->disable_cdf_update = dav1d_get_bits(gb, 1);
+ hdr->disable_cdf_update = dav1d_get_bit(gb);
hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ?
- dav1d_get_bits(gb, 1) : seqhdr->screen_content_tools;
+ dav1d_get_bit(gb) : seqhdr->screen_content_tools;
if (hdr->allow_screen_content_tools)
hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ?
- dav1d_get_bits(gb, 1) : seqhdr->force_integer_mv;
+ dav1d_get_bit(gb) : seqhdr->force_integer_mv;
else
hdr->force_integer_mv = 0;
@@ -422,7 +405,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 :
- hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1);
+ hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bit(gb);
#if DEBUG_FRAME_HDR
printf("HDR: post-frame_size_override_flag: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
@@ -433,7 +416,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
if (seqhdr->decoder_model_info_present) {
- hdr->buffer_removal_time_present = dav1d_get_bits(gb, 1);
+ hdr->buffer_removal_time_present = dav1d_get_bit(gb);
if (hdr->buffer_removal_time_present) {
for (int i = 0; i < c->seq_hdr->num_operating_points; i++) {
const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i];
@@ -454,9 +437,14 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
for (int i = 0; i < 8; i++)
dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
+ if (c->strict_std_compliance &&
+ hdr->frame_type == DAV1D_FRAME_TYPE_INTRA && hdr->refresh_frame_flags == 0xff)
+ {
+ goto error;
+ }
if (read_frame_size(c, gb, 0) < 0) goto error;
hdr->allow_intrabc = hdr->allow_screen_content_tools &&
- !hdr->super_res.enabled && dav1d_get_bits(gb, 1);
+ !hdr->super_res.enabled && dav1d_get_bit(gb);
hdr->use_ref_frame_mvs = 0;
} else {
hdr->allow_intrabc = 0;
@@ -466,7 +454,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
for (int i = 0; i < 8; i++)
dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
hdr->frame_ref_short_signaling =
- seqhdr->order_hint && dav1d_get_bits(gb, 1);
+ seqhdr->order_hint && dav1d_get_bit(gb);
if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8
hdr->refidx[0] = dav1d_get_bits(gb, 3);
hdr->refidx[1] = hdr->refidx[2] = -1;
@@ -570,13 +558,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
const int use_ref = !hdr->error_resilient_mode &&
hdr->frame_size_override;
if (read_frame_size(c, gb, use_ref) < 0) goto error;
- hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1);
- hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? DAV1D_FILTER_SWITCHABLE :
+ hdr->hp = !hdr->force_integer_mv && dav1d_get_bit(gb);
+ hdr->subpel_filter_mode = dav1d_get_bit(gb) ? DAV1D_FILTER_SWITCHABLE :
dav1d_get_bits(gb, 2);
- hdr->switchable_motion_mode = dav1d_get_bits(gb, 1);
+ hdr->switchable_motion_mode = dav1d_get_bit(gb);
hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
seqhdr->ref_frame_mvs && seqhdr->order_hint &&
- IS_INTER_OR_SWITCH(hdr) && dav1d_get_bits(gb, 1);
+ IS_INTER_OR_SWITCH(hdr) && dav1d_get_bit(gb);
}
#if DEBUG_FRAME_HDR
printf("HDR: post-frametype-specific-bits: off=%td\n",
@@ -584,14 +572,14 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
hdr->refresh_context = !seqhdr->reduced_still_picture_header &&
- !hdr->disable_cdf_update && !dav1d_get_bits(gb, 1);
+ !hdr->disable_cdf_update && !dav1d_get_bit(gb);
#if DEBUG_FRAME_HDR
printf("HDR: post-refresh_context: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
// tile data
- hdr->tiling.uniform = dav1d_get_bits(gb, 1);
+ hdr->tiling.uniform = dav1d_get_bit(gb);
const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
const int sbsz_log2 = 6 + seqhdr->sb128;
const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
@@ -605,7 +593,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->tiling.min_log2_cols);
if (hdr->tiling.uniform) {
for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols;
- hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bits(gb, 1);
+ hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bit(gb);
hdr->tiling.log2_cols++) ;
const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols);
hdr->tiling.cols = 0;
@@ -615,7 +603,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
imax(min_log2_tiles - hdr->tiling.log2_cols, 0);
for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows;
- hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bits(gb, 1);
+ hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bit(gb);
hdr->tiling.log2_rows++) ;
const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows);
hdr->tiling.rows = 0;
@@ -666,17 +654,17 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
// quant data
hdr->quant.yac = dav1d_get_bits(gb, 8);
- hdr->quant.ydc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+ hdr->quant.ydc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
if (!seqhdr->monochrome) {
// If the sequence header says that delta_q might be different
// for U, V, we must check whether it actually is for this
// frame.
- const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0;
- hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
- hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+ const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bit(gb) : 0;
+ hdr->quant.udc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ hdr->quant.uac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
if (diff_uv_delta) {
- hdr->quant.vdc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
- hdr->quant.vac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
+ hdr->quant.vdc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ hdr->quant.vac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
} else {
hdr->quant.vdc_delta = hdr->quant.udc_delta;
hdr->quant.vac_delta = hdr->quant.uac_delta;
@@ -686,7 +674,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
printf("HDR: post-quant: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
- hdr->quant.qm = dav1d_get_bits(gb, 1);
+ hdr->quant.qm = dav1d_get_bit(gb);
if (hdr->quant.qm) {
hdr->quant.qm_y = dav1d_get_bits(gb, 4);
hdr->quant.qm_u = dav1d_get_bits(gb, 4);
@@ -700,17 +688,17 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
// segmentation data
- hdr->segmentation.enabled = dav1d_get_bits(gb, 1);
+ hdr->segmentation.enabled = dav1d_get_bit(gb);
if (hdr->segmentation.enabled) {
if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
hdr->segmentation.update_map = 1;
hdr->segmentation.temporal = 0;
hdr->segmentation.update_data = 1;
} else {
- hdr->segmentation.update_map = dav1d_get_bits(gb, 1);
+ hdr->segmentation.update_map = dav1d_get_bit(gb);
hdr->segmentation.temporal =
- hdr->segmentation.update_map ? dav1d_get_bits(gb, 1) : 0;
- hdr->segmentation.update_data = dav1d_get_bits(gb, 1);
+ hdr->segmentation.update_map ? dav1d_get_bit(gb) : 0;
+ hdr->segmentation.update_data = dav1d_get_bit(gb);
}
if (hdr->segmentation.update_data) {
@@ -719,48 +707,48 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
Dav1dSegmentationData *const seg =
&hdr->segmentation.seg_data.d[i];
- if (dav1d_get_bits(gb, 1)) {
- seg->delta_q = dav1d_get_sbits(gb, 8);
+ if (dav1d_get_bit(gb)) {
+ seg->delta_q = dav1d_get_sbits(gb, 9);
hdr->segmentation.seg_data.last_active_segid = i;
} else {
seg->delta_q = 0;
}
- if (dav1d_get_bits(gb, 1)) {
- seg->delta_lf_y_v = dav1d_get_sbits(gb, 6);
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_y_v = dav1d_get_sbits(gb, 7);
hdr->segmentation.seg_data.last_active_segid = i;
} else {
seg->delta_lf_y_v = 0;
}
- if (dav1d_get_bits(gb, 1)) {
- seg->delta_lf_y_h = dav1d_get_sbits(gb, 6);
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_y_h = dav1d_get_sbits(gb, 7);
hdr->segmentation.seg_data.last_active_segid = i;
} else {
seg->delta_lf_y_h = 0;
}
- if (dav1d_get_bits(gb, 1)) {
- seg->delta_lf_u = dav1d_get_sbits(gb, 6);
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_u = dav1d_get_sbits(gb, 7);
hdr->segmentation.seg_data.last_active_segid = i;
} else {
seg->delta_lf_u = 0;
}
- if (dav1d_get_bits(gb, 1)) {
- seg->delta_lf_v = dav1d_get_sbits(gb, 6);
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_v = dav1d_get_sbits(gb, 7);
hdr->segmentation.seg_data.last_active_segid = i;
} else {
seg->delta_lf_v = 0;
}
- if (dav1d_get_bits(gb, 1)) {
+ if (dav1d_get_bit(gb)) {
seg->ref = dav1d_get_bits(gb, 3);
hdr->segmentation.seg_data.last_active_segid = i;
hdr->segmentation.seg_data.preskip = 1;
} else {
seg->ref = -1;
}
- if ((seg->skip = dav1d_get_bits(gb, 1))) {
+ if ((seg->skip = dav1d_get_bit(gb))) {
hdr->segmentation.seg_data.last_active_segid = i;
hdr->segmentation.seg_data.preskip = 1;
}
- if ((seg->globalmv = dav1d_get_bits(gb, 1))) {
+ if ((seg->globalmv = dav1d_get_bit(gb))) {
hdr->segmentation.seg_data.last_active_segid = i;
hdr->segmentation.seg_data.preskip = 1;
}
@@ -785,12 +773,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
// delta q
- hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bits(gb, 1) : 0;
+ hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bit(gb) : 0;
hdr->delta.q.res_log2 = hdr->delta.q.present ? dav1d_get_bits(gb, 2) : 0;
hdr->delta.lf.present = hdr->delta.q.present && !hdr->allow_intrabc &&
- dav1d_get_bits(gb, 1);
+ dav1d_get_bit(gb);
hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0;
- hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0;
+ hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bit(gb) : 0;
#if DEBUG_FRAME_HDR
printf("HDR: post-delta_q_lf_flags: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
@@ -836,18 +824,18 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->loopfilter.mode_ref_deltas =
c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
}
- hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bits(gb, 1);
+ hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bit(gb);
if (hdr->loopfilter.mode_ref_delta_enabled) {
- hdr->loopfilter.mode_ref_delta_update = dav1d_get_bits(gb, 1);
+ hdr->loopfilter.mode_ref_delta_update = dav1d_get_bit(gb);
if (hdr->loopfilter.mode_ref_delta_update) {
for (int i = 0; i < 8; i++)
- if (dav1d_get_bits(gb, 1))
+ if (dav1d_get_bit(gb))
hdr->loopfilter.mode_ref_deltas.ref_delta[i] =
- dav1d_get_sbits(gb, 6);
+ dav1d_get_sbits(gb, 7);
for (int i = 0; i < 2; i++)
- if (dav1d_get_bits(gb, 1))
+ if (dav1d_get_bit(gb))
hdr->loopfilter.mode_ref_deltas.mode_delta[i] =
- dav1d_get_sbits(gb, 6);
+ dav1d_get_sbits(gb, 7);
}
}
}
@@ -893,16 +881,16 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
{
// Log2 of the restoration unit size.
hdr->restoration.unit_size[0] = 6 + seqhdr->sb128;
- if (dav1d_get_bits(gb, 1)) {
+ if (dav1d_get_bit(gb)) {
hdr->restoration.unit_size[0]++;
if (!seqhdr->sb128)
- hdr->restoration.unit_size[0] += dav1d_get_bits(gb, 1);
+ hdr->restoration.unit_size[0] += dav1d_get_bit(gb);
}
hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0];
if ((hdr->restoration.type[1] || hdr->restoration.type[2]) &&
seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1)
{
- hdr->restoration.unit_size[1] -= dav1d_get_bits(gb, 1);
+ hdr->restoration.unit_size[1] -= dav1d_get_bit(gb);
}
} else {
hdr->restoration.unit_size[0] = 8;
@@ -918,12 +906,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
#endif
hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY :
- dav1d_get_bits(gb, 1) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
+ dav1d_get_bit(gb) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
#if DEBUG_FRAME_HDR
printf("HDR: post-txfmmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
- hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 1) : 0;
+ hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bit(gb) : 0;
#if DEBUG_FRAME_HDR
printf("HDR: post-refmode: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
@@ -935,7 +923,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
int off_after = -1;
int off_before_idx, off_after_idx;
for (int i = 0; i < 7; i++) {
- if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error;
+ if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
@@ -963,7 +951,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
unsigned off_before2 = 0xFFFFFFFFU;
int off_before2_idx;
for (int i = 0; i < 7; i++) {
- if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error;
+ if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
if (get_poc_diff(seqhdr->order_hint_n_bits,
refpoc, off_before) < 0) {
@@ -984,18 +972,18 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
}
}
- hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0;
+ hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bit(gb) : 0;
#if DEBUG_FRAME_HDR
printf("HDR: post-extskip: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) &&
- seqhdr->warped_motion && dav1d_get_bits(gb, 1);
+ seqhdr->warped_motion && dav1d_get_bit(gb);
#if DEBUG_FRAME_HDR
printf("HDR: post-warpmotionbit: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
#endif
- hdr->reduced_txtp_set = dav1d_get_bits(gb, 1);
+ hdr->reduced_txtp_set = dav1d_get_bit(gb);
#if DEBUG_FRAME_HDR
printf("HDR: post-reducedtxtpset: off=%td\n",
(gb->ptr - init_ptr) * 8 - gb->bits_left);
@@ -1006,9 +994,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
if (IS_INTER_OR_SWITCH(hdr)) {
for (int i = 0; i < 7; i++) {
- hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY :
- dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM :
- dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_TRANSLATION :
+ hdr->gmv[i].type = !dav1d_get_bit(gb) ? DAV1D_WM_TYPE_IDENTITY :
+ dav1d_get_bit(gb) ? DAV1D_WM_TYPE_ROT_ZOOM :
+ dav1d_get_bit(gb) ? DAV1D_WM_TYPE_TRANSLATION :
DAV1D_WM_TYPE_AFFINE;
if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue;
@@ -1057,10 +1045,10 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
hdr->film_grain.present = seqhdr->film_grain_present &&
(hdr->show_frame || hdr->showable_frame) &&
- dav1d_get_bits(gb, 1);
+ dav1d_get_bit(gb);
if (hdr->film_grain.present) {
const unsigned seed = dav1d_get_bits(gb, 16);
- hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bits(gb, 1);
+ hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bit(gb);
if (!hdr->film_grain.update) {
const int refidx = dav1d_get_bits(gb, 3);
int i;
@@ -1084,7 +1072,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
}
fgd->chroma_scaling_from_luma =
- !seqhdr->monochrome && dav1d_get_bits(gb, 1);
+ !seqhdr->monochrome && dav1d_get_bit(gb);
if (seqhdr->monochrome || fgd->chroma_scaling_from_luma ||
(seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points))
{
@@ -1128,8 +1116,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128;
fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256;
}
- fgd->overlap_flag = dav1d_get_bits(gb, 1);
- fgd->clip_to_restricted_range = dav1d_get_bits(gb, 1);
+ fgd->overlap_flag = dav1d_get_bit(gb);
+ fgd->clip_to_restricted_range = dav1d_get_bit(gb);
}
} else {
memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data));
@@ -1148,7 +1136,7 @@ error:
static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
- const int have_tile_pos = n_tiles > 1 ? dav1d_get_bits(gb, 1) : 0;
+ const int have_tile_pos = n_tiles > 1 ? dav1d_get_bit(gb) : 0;
if (have_tile_pos) {
const int n_bits = c->frame_hdr->tiling.log2_cols +
@@ -1194,11 +1182,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
dav1d_init_get_bits(&gb, in->data, in->sz);
// obu header
- dav1d_get_bits(&gb, 1); // obu_forbidden_bit
+ dav1d_get_bit(&gb); // obu_forbidden_bit
const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
- const int has_extension = dav1d_get_bits(&gb, 1);
- const int has_length_field = dav1d_get_bits(&gb, 1);
- dav1d_get_bits(&gb, 1); // reserved
+ const int has_extension = dav1d_get_bit(&gb);
+ const int has_length_field = dav1d_get_bit(&gb);
+ dav1d_get_bit(&gb); // reserved
int temporal_id = 0, spatial_id = 0;
if (has_extension) {
@@ -1245,7 +1233,6 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
sizeof(Dav1dSequenceHeader));
if (!ref) return DAV1D_ERR(ENOMEM);
Dav1dSequenceHeader *seq_hdr = ref->data;
- memset(seq_hdr, 0, sizeof(*seq_hdr));
if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
dav1d_ref_dec(&ref);
goto error;
@@ -1270,7 +1257,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
dav1d_ref_dec(&c->mastering_display_ref);
dav1d_ref_dec(&c->content_light_ref);
for (int i = 0; i < 8; i++) {
- if (c->refs[i].p.p.data[0])
+ if (c->refs[i].p.p.frame_hdr)
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_ref_dec(&c->refs[i].segmap);
dav1d_ref_dec(&c->refs[i].refmvs);
@@ -1319,7 +1306,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (type != DAV1D_OBU_FRAME) {
// This is actually a frame header OBU so read the
// trailing bit and check for overrun.
- dav1d_get_bits(&gb, 1);
+ dav1d_get_bit(&gb);
if (check_for_overrun(c, &gb, init_bit_pos, len)) {
c->frame_hdr = NULL;
goto error;
@@ -1419,7 +1406,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
#endif
// Skip the trailing bit, align to the next byte boundary and check for overrun.
- dav1d_get_bits(&gb, 1);
+ dav1d_get_bit(&gb);
dav1d_bytealign_get_bits(&gb);
if (check_for_overrun(c, &gb, init_bit_pos, len)) {
dav1d_ref_dec(&ref);
@@ -1471,7 +1458,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
(gb.ptr - init_ptr) * 8 - gb.bits_left);
#endif
// Skip the trailing bit, align to the next byte boundary and check for overrun.
- dav1d_get_bits(&gb, 1);
+ dav1d_get_bit(&gb);
dav1d_bytealign_get_bits(&gb);
if (check_for_overrun(c, &gb, init_bit_pos, len)) {
dav1d_ref_dec(&ref);
@@ -1503,7 +1490,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (payload_size <= 0) {
dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
- goto error;
+ break;
}
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
@@ -1550,7 +1537,26 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (c->seq_hdr && c->frame_hdr) {
if (c->frame_hdr->show_existing_frame) {
+ if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr) goto error;
+ switch (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type) {
+ case DAV1D_FRAME_TYPE_INTER:
+ case DAV1D_FRAME_TYPE_SWITCH:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE)
+ goto skip;
+ break;
+ case DAV1D_FRAME_TYPE_INTRA:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA)
+ goto skip;
+ // fall-through
+ default:
+ break;
+ }
if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error;
+ if (c->strict_std_compliance &&
+ !c->refs[c->frame_hdr->existing_frame_idx].p.showable)
+ {
+ goto error;
+ }
if (c->n_fc == 1) {
dav1d_thread_picture_ref(&c->out,
&c->refs[c->frame_hdr->existing_frame_idx].p);
@@ -1570,10 +1576,13 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
- if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
@@ -1602,10 +1611,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
}
if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
const int r = c->frame_hdr->existing_frame_idx;
+ c->refs[r].p.showable = 0;
for (int i = 0; i < 8; i++) {
if (i == r) continue;
- if (c->refs[i].p.p.data[0])
+ if (c->refs[i].p.p.frame_hdr)
dav1d_thread_picture_unref(&c->refs[i].p);
dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);
@@ -1621,6 +1631,23 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
}
c->frame_hdr = NULL;
} else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
+ switch (c->frame_hdr->frame_type) {
+ case DAV1D_FRAME_TYPE_INTER:
+ case DAV1D_FRAME_TYPE_SWITCH:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE ||
+ (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
+ !c->frame_hdr->refresh_frame_flags))
+ goto skip;
+ break;
+ case DAV1D_FRAME_TYPE_INTRA:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA ||
+ (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
+ !c->frame_hdr->refresh_frame_flags))
+ goto skip;
+ // fall-through
+ default:
+ break;
+ }
if (!c->n_tile_data)
goto error;
if ((res = dav1d_submit_frame(c)) < 0)
@@ -1633,6 +1660,26 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
return len + init_byte_pos;
+skip:
+ // update refs with only the headers in case we skip the frame
+ for (int i = 0; i < 8; i++) {
+ if (c->frame_hdr->refresh_frame_flags & (1 << i)) {
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ c->refs[i].p.p.frame_hdr = c->frame_hdr;
+ c->refs[i].p.p.seq_hdr = c->seq_hdr;
+ c->refs[i].p.p.frame_hdr_ref = c->frame_hdr_ref;
+ c->refs[i].p.p.seq_hdr_ref = c->seq_hdr_ref;
+ dav1d_ref_inc(c->frame_hdr_ref);
+ dav1d_ref_inc(c->seq_hdr_ref);
+ }
+ }
+
+ dav1d_ref_dec(&c->frame_hdr_ref);
+ c->frame_hdr = NULL;
+ c->n_tiles = 0;
+
+ return len + init_byte_pos;
+
error:
dav1d_data_props_copy(&c->cached_error_props, &in->m);
dav1d_log(c, "Error parsing OBU data\n");
diff --git a/chromium/third_party/dav1d/libdav1d/src/picture.c b/chromium/third_party/dav1d/libdav1d/src/picture.c
index bebc4dd9c17..58ebd824d68 100644
--- a/chromium/third_party/dav1d/libdav1d/src/picture.c
+++ b/chromium/third_party/dav1d/libdav1d/src/picture.c
@@ -194,10 +194,15 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
+ // Don't clear these flags from c->frame_flags if the frame is not visible.
+ // This way they will be added to the next visible frame too.
+ const int flags_mask = (f->frame_hdr->show_frame || c->output_invisible_frames)
+ ? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO);
p->flags = c->frame_flags;
- c->frame_flags = 0;
+ c->frame_flags &= flags_mask;
p->visible = f->frame_hdr->show_frame;
+ p->showable = f->frame_hdr->showable_frame;
if (have_frame_mt) {
atomic_init(&p->progress[0], 0);
atomic_init(&p->progress[1], 0);
@@ -228,13 +233,13 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
if (src->ref) {
validate_input(src->data[0] != NULL);
dav1d_ref_inc(src->ref);
- if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
- if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
- if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
- if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
- if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
- if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
}
+ if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
+ if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
+ if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+ if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
+ if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
+ if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
*dst = *src;
}
@@ -255,6 +260,7 @@ void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
{
dav1d_picture_ref(&dst->p, &src->p);
dst->visible = src->visible;
+ dst->showable = src->showable;
dst->progress = src->progress;
dst->flags = src->flags;
}
@@ -264,6 +270,7 @@ void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst,
{
dav1d_picture_move_ref(&dst->p, &src->p);
dst->visible = src->visible;
+ dst->showable = src->showable;
dst->progress = src->progress;
dst->flags = src->flags;
memset(src, 0, sizeof(*src));
@@ -275,13 +282,13 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
if (p->ref) {
validate_input(p->data[0] != NULL);
dav1d_ref_dec(&p->ref);
- dav1d_ref_dec(&p->seq_hdr_ref);
- dav1d_ref_dec(&p->frame_hdr_ref);
- dav1d_ref_dec(&p->m.user_data.ref);
- dav1d_ref_dec(&p->content_light_ref);
- dav1d_ref_dec(&p->mastering_display_ref);
- dav1d_ref_dec(&p->itut_t35_ref);
}
+ dav1d_ref_dec(&p->seq_hdr_ref);
+ dav1d_ref_dec(&p->frame_hdr_ref);
+ dav1d_ref_dec(&p->m.user_data.ref);
+ dav1d_ref_dec(&p->content_light_ref);
+ dav1d_ref_dec(&p->mastering_display_ref);
+ dav1d_ref_dec(&p->itut_t35_ref);
memset(p, 0, sizeof(*p));
dav1d_data_props_set_defaults(&p->m);
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/picture.h b/chromium/third_party/dav1d/libdav1d/src/picture.h
index 0e30d48eb86..154c85a0c6a 100644
--- a/chromium/third_party/dav1d/libdav1d/src/picture.h
+++ b/chromium/third_party/dav1d/libdav1d/src/picture.h
@@ -52,6 +52,10 @@ enum PictureFlags {
typedef struct Dav1dThreadPicture {
Dav1dPicture p;
int visible;
+ // This can be set for inter frames, non-key intra frames, or for invisible
+ // keyframes that have not yet been made visible using the show-existing-frame
+ // mechanism.
+ int showable;
enum PictureFlags flags;
// [0] block data (including segmentation map and motion vectors)
// [1] pixel data
diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/cdef.h b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef.h
new file mode 100644
index 00000000000..b794ba53bef
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+
+#include "common/bitdepth.h"
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/cpu.h"
+
+#define cdef_vsx_fn(w, h) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges)
+
+cdef_vsx_fn(4, 4);
+cdef_vsx_fn(4, 8);
+cdef_vsx_fn(8, 8);
+
+static ALWAYS_INLINE void cdef_dsp_init_ppc(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+ c->fb[0] = dav1d_cdef_filter_8x8_vsx;
+ c->fb[1] = dav1d_cdef_filter_4x8_vsx;
+ c->fb[2] = dav1d_cdef_filter_4x4_vsx;
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/cdef_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef_tmpl.c
new file mode 100644
index 00000000000..e2e759810f7
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/ppc/cdef_tmpl.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/ppc/dav1d_types.h"
+#include "src/ppc/cdef.h"
+
+#if BITDEPTH == 8
+static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
+ const int damping)
+{
+ const i16x8 zero = vec_splat_s16(0);
+ if (!threshold) return zero;
+ const uint16_t shift = imax(0, damping - ulog2(threshold));
+ const i16x8 abs_diff = vec_abs(diff);
+ const b16x8 mask = vec_cmplt(diff, zero);
+ const i16x8 thr = vec_splats(threshold);
+ const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
+ const i16x8 max = vec_max(zero, sub);
+ const i16x8 min = vec_min(abs_diff, max);
+ const i16x8 neg = vec_sub(zero, min);
+ return vec_sel(min, neg, mask);
+}
+
+static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+ const uint8_t *src, const ptrdiff_t src_stride,
+ const uint8_t (*left)[2], const uint8_t *const top,
+ const uint8_t *const bottom, const int w, const int h,
+ const enum CdefEdgeFlags edges)
+{
+ const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+ u16x8 l0;
+ u16x8 l1;
+
+ int y_start = -2, y_end = h + 2;
+
+ // Copy top and bottom first
+ if (!(edges & CDEF_HAVE_TOP)) {
+ l0 = fill;
+ l1 = fill;
+ y_start = 0;
+ } else {
+ l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
+ l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
+ }
+
+ vec_st(l0, 0, tmp - 2 * 8);
+ vec_st(l1, 0, tmp - 1 * 8);
+
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ l0 = fill;
+ l1 = fill;
+ y_end -= 2;
+ } else {
+ l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2));
+ l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2));
+ }
+
+ vec_st(l0, 0, tmp + (h + 0) * 8);
+ vec_st(l1, 0, tmp + (h + 1) * 8);
+
+ int y_with_left_edge = 0;
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ u16x8 l = u8h_to_u16(vec_vsx_ld(0, src));
+ vec_vsx_st(l, 0, tmp + 2);
+
+ y_with_left_edge = 1;
+ }
+
+ for (int y = y_with_left_edge; y < h; y++) {
+ u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
+ vec_st(l, 0, tmp + y * 8);
+ }
+
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[y * 8] = INT16_MAX;
+ tmp[1 + y * 8] = INT16_MAX;
+ }
+ } else {
+ for (int y = 0; y < h; y++) {
+ tmp[y * 8] = left[y][0];
+ tmp[1 + y * 8] = left[y][1];
+ }
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[- 2 + (y + 1) * 8] = INT16_MAX;
+ tmp[- 1 + (y + 1) * 8] = INT16_MAX;
+ }
+ }
+}
+
+static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+ const uint8_t *src, const ptrdiff_t src_stride,
+ const uint8_t (*left)[2], const uint8_t *const top,
+ const uint8_t *const bottom, const int w, const int h,
+ const enum CdefEdgeFlags edges)
+{
+ const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+ u16x8 l0h, l0l;
+ u16x8 l1h, l1l;
+
+ int y_start = -2, y_end = h + 2;
+
+ // Copy top and bottom first
+ if (!(edges & CDEF_HAVE_TOP)) {
+ l0h = fill;
+ l0l = fill;
+ l1h = fill;
+ l1l = fill;
+ y_start = 0;
+ } else {
+ u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
+ u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
+ l0h = u8h_to_u16(l0);
+ l0l = u8l_to_u16(l0);
+ l1h = u8h_to_u16(l1);
+ l1l = u8l_to_u16(l1);
+ }
+
+ vec_st(l0h, 0, tmp - 4 * 8);
+ vec_st(l0l, 0, tmp - 3 * 8);
+ vec_st(l1h, 0, tmp - 2 * 8);
+ vec_st(l1l, 0, tmp - 1 * 8);
+
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ l0h = fill;
+ l0l = fill;
+ l1h = fill;
+ l1l = fill;
+ y_end -= 2;
+ } else {
+ u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2);
+ u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2);
+ l0h = u8h_to_u16(l0);
+ l0l = u8l_to_u16(l0);
+ l1h = u8h_to_u16(l1);
+ l1l = u8l_to_u16(l1);
+ }
+
+ vec_st(l0h, 0, tmp + (h + 0) * 16);
+ vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
+ vec_st(l1h, 0, tmp + (h + 1) * 16);
+ vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
+
+ int y_with_left_edge = 0;
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ u8x16 l = vec_vsx_ld(0, src);
+ u16x8 lh = u8h_to_u16(l);
+ u16x8 ll = u8l_to_u16(l);
+ vec_vsx_st(lh, 0, tmp + 2);
+ vec_vsx_st(ll, 0, tmp + 8 + 2);
+
+ y_with_left_edge = 1;
+ }
+
+ for (int y = y_with_left_edge; y < h; y++) {
+ u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
+ u16x8 lh = u8h_to_u16(l);
+ u16x8 ll = u8l_to_u16(l);
+ vec_st(lh, 0, tmp + y * 16);
+ vec_st(ll, 0, tmp + 8 + y * 16);
+ }
+
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[y * 16] = INT16_MAX;
+ tmp[1 + y * 16] = INT16_MAX;
+ }
+ } else {
+ for (int y = 0; y < h; y++) {
+ tmp[y * 16] = left[y][0];
+ tmp[1 + y * 16] = left[y][1];
+ }
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[- 6 + (y + 1) * 16] = INT16_MAX;
+ tmp[- 5 + (y + 1) * 16] = INT16_MAX;
+ }
+ }
+}
+
+static inline i16x8 max_mask(i16x8 a, i16x8 b) {
+ const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX);
+
+ const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX);
+
+ const i16x8 val = vec_sel(a, b, mask);
+
+ return vec_max(val, b);
+}
+
+#define LOAD_PIX(addr) \
+ const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
+ i16x8 max = px; \
+ i16x8 min = px; \
+ i16x8 sum = vec_splat_s16(0);
+
+#define LOAD_PIX4(addr) \
+ const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
+ const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+ const i16x8 px = vec_xxpermdi(a, b, 0); \
+ i16x8 max = px; \
+ i16x8 min = px; \
+ i16x8 sum = vec_splat_s16(0);
+
+#define LOAD_DIR(p, addr, o0, o1) \
+ const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
+ const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
+ const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
+ const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);
+
+#define LOAD_DIR4(p, addr, o0, o1) \
+ LOAD_DIR(p ## a, addr, o0, o1) \
+ LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+ const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
+ const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
+ const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
+ const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
+
+#define CONSTRAIN(p, strength) \
+ const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
+ const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
+ const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
+ const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
+\
+ i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
+ i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
+ i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
+ i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+
+#define MIN_MAX(p) \
+ max = max_mask(p ## 0, max); \
+ min = vec_min(p ## 0, min); \
+ max = max_mask(p ## 1, max); \
+ min = vec_min(p ## 1, min); \
+ max = max_mask(p ## 2, max); \
+ min = vec_min(p ## 2, min); \
+ max = max_mask(p ## 3, max); \
+ min = vec_min(p ## 3, min);
+
+#define PRI_0(p) \
+ p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
+ p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+
+#define PRI_1(p) \
+ p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
+ p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
+
+#define SEC_0(p) \
+ p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
+ p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
+ p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
+ p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+
+#define UPDATE_SUM(p) \
+ const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
+ const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
+ sum = vec_add(sum, p ## sum0); \
+ sum = vec_add(sum, p ## sum1);
+
+static inline void
+filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int sec_strength, const int dir,
+ const int damping, const enum CdefEdgeFlags edges,
+ const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+ const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
+ };
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+ const int off1 = cdef_directions[dir][0];
+ const int off1_1 = cdef_directions[dir][1];
+
+ const int off2 = cdef_directions[(dir + 2) & 7][0];
+ const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+ copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
+ for (int y = 0; y < h / 2; y++) {
+ LOAD_PIX4(tmp)
+
+ // Primary pass
+ LOAD_DIR4(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength)
+
+ MIN_MAX(p)
+
+ PRI_0(p)
+ PRI_1(p)
+
+ UPDATE_SUM(p)
+
+ // Secondary pass 1
+ LOAD_DIR4(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength)
+
+ MIN_MAX(s)
+
+ SEC_0(s)
+
+ UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength)
+
+ MIN_MAX(s2)
+
+ UPDATE_SUM(s2)
+
+ // Store
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+ bias = vec_sub(vec_splat_s16(8), bias);
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+ dst[0] = vdst[0];
+ dst[1] = vdst[1];
+ dst[2] = vdst[2];
+ dst[3] = vdst[3];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ dst[0] = vdst[4];
+ dst[1] = vdst[5];
+ dst[2] = vdst[6];
+ dst[3] = vdst[7];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static inline void
+filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int sec_strength, const int dir,
+ const int damping, const enum CdefEdgeFlags edges,
+ const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+ const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
+ };
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+
+ const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+ const int off1 = cdef_directions[dir][0];
+ const int off1_1 = cdef_directions[dir][1];
+
+ const int off2 = cdef_directions[(dir + 2) & 7][0];
+ const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+ copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
+ for (int y = 0; y < h; y++) {
+ LOAD_PIX(tmp)
+
+ // Primary pass
+ LOAD_DIR(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength)
+
+ MIN_MAX(p)
+
+ PRI_0(p)
+ PRI_1(p)
+
+ UPDATE_SUM(p)
+
+ // Secondary pass 1
+ LOAD_DIR(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength)
+
+ MIN_MAX(s)
+
+ SEC_0(s)
+
+ UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength)
+
+ MIN_MAX(s2)
+
+ UPDATE_SUM(s2)
+
+ // Store
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+ bias = vec_sub(vec_splat_s16(8), bias);
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+ dst[0] = vdst[0];
+ dst[1] = vdst[1];
+ dst[2] = vdst[2];
+ dst[3] = vdst[3];
+ dst[4] = vdst[4];
+ dst[5] = vdst[5];
+ dst[6] = vdst[6];
+ dst[7] = vdst[7];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ }
+
+}
+
+#define cdef_fn(w, h, tmp_stride) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
+ filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+ sec_strength, dir, damping, edges, tmp_stride, tmp); \
+}
+
+cdef_fn(4, 4, 8);
+cdef_fn(4, 8, 8);
+cdef_fn(8, 8, 16);
+#endif
diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration.h
new file mode 100644
index 00000000000..3fe16318bd5
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/intops.h"
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+ c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_vsx;
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration_tmpl.c
new file mode 100644
index 00000000000..c0c64e18002
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/ppc/looprestoration_tmpl.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/ppc/dav1d_types.h"
+#include "src/ppc/looprestoration.h"
+
+#if BITDEPTH == 8
+
+#define REST_UNIT_STRIDE (400)
+
+static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
+ v = vec_max(minv, v);
+ v = vec_min(maxv, v);
+ return v;
+}
+
+#define APPLY_FILTER_H(v, f, ssum1, ssum2) do { \
+ i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \
+ i16x8 ktmp_u16_low = (i16x8) u8l_to_u16(v); \
+ ssum1 = vec_madd(ktmp_u16_high, f, ssum1); \
+ ssum2 = vec_madd(ktmp_u16_low, f, ssum2); \
+} while (0)
+
+static void wiener_filter_h_vsx(int32_t *hor_ptr,
+ uint8_t *tmp_ptr,
+ const int16_t filterh[8],
+ const int w, const int h)
+{
+ const i32x4 zerov = vec_splats(0);
+ const i32x4 seven_vec = vec_splats(7);
+ const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
+ const i32x4 round_bits_vec = vec_splats(3);
+ const i32x4 rounding_off_vec = vec_splats(1<<2);
+ const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);
+
+ i16x8 filterhvall = vec_vsx_ld(0, filterh);
+ i16x8 filterhv0 = vec_splat( filterhvall, 0);
+ i16x8 filterhv1 = vec_splat( filterhvall, 1);
+ i16x8 filterhv2 = vec_splat( filterhvall, 2);
+ i16x8 filterhv3 = vec_splat( filterhvall, 3);
+ i16x8 filterhv4 = vec_splat( filterhvall, 4);
+ i16x8 filterhv5 = vec_splat( filterhvall, 5);
+ i16x8 filterhv6 = vec_splat( filterhvall, 6);
+
+ for (int j = 0; j < h + 6; j++) {
+ for (int i = 0; i < w; i+=16) {
+ i32x4 sum1 = bitdepth_added_vec;
+ i32x4 sum2 = bitdepth_added_vec;
+ i32x4 sum3 = bitdepth_added_vec;
+ i32x4 sum4 = bitdepth_added_vec;
+
+ u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
+ u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);
+
+ u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15);
+ u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14);
+ u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13);
+ u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12);
+ u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11);
+ u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10);
+
+ u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
+ u16x8 tmp_u16_low = u8l_to_u16(tmp_v3);
+
+ i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
+ i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
+ i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
+ i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);
+
+ i16x8 ssum1 = (i16x8) zerov;
+ i16x8 ssum2 = (i16x8) zerov;
+
+ APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);
+
+ sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
+ sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
+ sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
+ sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);
+
+ sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
+ sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
+ sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
+ sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;
+
+ sum1 = iclip_vec(sum1, zerov, clip_limit_v);
+ sum2 = iclip_vec(sum2, zerov, clip_limit_v);
+ sum3 = iclip_vec(sum3, zerov, clip_limit_v);
+ sum4 = iclip_vec(sum4, zerov, clip_limit_v);
+
+ vec_st(sum1, 0, &hor_ptr[i]);
+ vec_st(sum2, 16, &hor_ptr[i]);
+ vec_st(sum3, 32, &hor_ptr[i]);
+ vec_st(sum4, 48, &hor_ptr[i]);
+ }
+ tmp_ptr += REST_UNIT_STRIDE;
+ hor_ptr += REST_UNIT_STRIDE;
+ }
+}
+
+static inline i16x8 iclip_u8_vec(i16x8 v) {
+ const i16x8 zerov = vec_splats((int16_t)0);
+ const i16x8 maxv = vec_splats((int16_t)255);
+ v = vec_max(zerov, v);
+ v = vec_min(maxv, v);
+ return v;
+}
+
+#define APPLY_FILTER_V(index, f) do { \
+ i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v3 = vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ sum1 = sum1 + v1 * f; \
+ sum2 = sum2 + v2 * f; \
+ sum3 = sum3 + v3 * f; \
+ sum4 = sum4 + v4 * f; \
+} while (0)
+
+#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
+ i32x4 sum1 = round_vec; \
+ i32x4 sum2 = round_vec; \
+ i32x4 sum3 = round_vec; \
+ i32x4 sum4 = round_vec; \
+ APPLY_FILTER_V(0, filterv0); \
+ APPLY_FILTER_V(1, filterv1); \
+ APPLY_FILTER_V(2, filterv2); \
+ APPLY_FILTER_V(3, filterv3); \
+ APPLY_FILTER_V(4, filterv4); \
+ APPLY_FILTER_V(5, filterv5); \
+ APPLY_FILTER_V(6, filterv6); \
+ sum1 = sum1 >> round_bits_vec; \
+ sum2 = sum2 >> round_bits_vec; \
+ sum3 = sum3 >> round_bits_vec; \
+ sum4 = sum4 >> round_bits_vec; \
+ i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \
+ i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \
+ sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
+ sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
+ sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \
+} while (0)
+
+static inline void wiener_filter_v_vsx(uint8_t *p,
+ const ptrdiff_t stride,
+ const int32_t *hor,
+ const int16_t filterv[8],
+ const int w, const int h)
+{
+ const i32x4 round_bits_vec = vec_splats(11);
+ const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18));
+
+ i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
+ i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
+ i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
+ i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
+ i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
+ i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
+ i32x4 filterv6 = vec_splats((int32_t) filterv[6]);
+
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i <(w-w%16); i += 16) {
+ u8x16 sum_pixel;
+ LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+ vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(stride) + i]);
+ }
+ // remaining loop
+ if (w & 0xf){
+ int i=w-w%16;
+ ALIGN_STK_16(uint8_t, tmp_out, 16,);
+ u8x16 sum_pixel;
+
+ LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+ vec_vsx_st(sum_pixel, 0, tmp_out);
+
+ for (int k=0; i<w; i++, k++) {
+ p[j * PXSTRIDE(stride) + i] = tmp_out[k];
+ }
+ }
+ }
+}
+
+static inline void padding(uint8_t *dst, const uint8_t *p,
+ const ptrdiff_t stride, const uint8_t (*left)[4],
+ const uint8_t *lpf, int unit_w, const int stripe_h,
+ const enum LrEdgeFlags edges)
+{
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ uint8_t *dst_l = dst + 3 * !have_left;
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const uint8_t *const above_1 = lpf;
+ const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
+ const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+ } else {
+ // Pad with last row
+ const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(stride);
+ }
+
+ if (!have_right) {
+ uint8_t *pad = dst_l + unit_w;
+ uint8_t *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last tops are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+
+ // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+ // of padding above and below
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
+
+ wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
+ wiener_filter_v_vsx(p, stride, hor, filter[1], w, h);
+}
+#endif
diff --git a/chromium/third_party/dav1d/libdav1d/src/qm.h b/chromium/third_party/dav1d/libdav1d/src/qm.h
index 23b2348a70c..8191c8afa77 100644
--- a/chromium/third_party/dav1d/libdav1d/src/qm.h
+++ b/chromium/third_party/dav1d/libdav1d/src/qm.h
@@ -30,7 +30,7 @@
#include "src/levels.h"
-extern const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
+EXTERN const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
void dav1d_init_qm_tables(void);
diff --git a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c
index 0ed4169aa00..3158ef5b023 100644
--- a/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c
+++ b/chromium/third_party/dav1d/libdav1d/src/recon_tmpl.c
@@ -591,7 +591,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
const int dq_shift = imax(0, t_dim->ctx - 2);
- const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
+ const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
unsigned cul_level, dc_sign_level;
if (!dc_tok) {
@@ -608,7 +608,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
- unsigned dc_dq = dq_tbl[0];
+ int dc_dq = dq_tbl[0];
dc_sign_level = (dc_sign - 1) & (2 << 6);
if (qm_tbl) {
@@ -628,7 +628,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
}
cul_level = dc_tok;
dc_dq >>= dq_shift;
- cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
+ dc_dq = umin(dc_dq, cf_max + dc_sign);
+ cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
if (rc) ac_qm: {
const unsigned ac_dq = dq_tbl[1];
@@ -638,6 +639,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
+ int dq_sat;
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
@@ -654,7 +656,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
}
cul_level += tok;
dq >>= dq_shift;
- cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
+ dq_sat = umin(dq, cf_max + sign);
+ cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
rc = rc_tok & 0x3ff;
} while (rc);
@@ -669,13 +672,13 @@ static int decode_coefs(Dav1dTaskContext *const t,
dc_tok &= 0xfffff;
dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
- dc_dq = umin(dc_dq - dc_sign, cf_max);
+ dc_dq = umin(dc_dq, cf_max + dc_sign);
} else {
- dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
+ dc_dq = ((dc_dq * dc_tok) >> dq_shift);
assert(dc_dq <= cf_max);
}
cul_level = dc_tok;
- cf[0] = (coef) (dc_dq ^ -dc_sign);
+ cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
if (rc) ac_noqm: {
const unsigned ac_dq = dq_tbl[1];
@@ -684,7 +687,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
- unsigned tok, dq;
+ unsigned tok;
+ int dq;
// residual
if (rc_tok >= (15 << 11)) {
@@ -698,15 +702,15 @@ static int decode_coefs(Dav1dTaskContext *const t,
// dequant, see 7.12.3
dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
- dq = umin(dq - sign, cf_max);
+ dq = umin(dq, cf_max + sign);
} else {
// cannot exceed cf_max, so we can avoid the clipping
tok = rc_tok >> 11;
- dq = ((ac_dq * tok) >> dq_shift) - sign;
+ dq = ((ac_dq * tok) >> dq_shift);
assert(dq <= cf_max);
}
cul_level += tok;
- cf[rc] = (coef) (dq ^ -sign);
+ cf[rc] = (coef) (sign ? -dq : dq);
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
} while (rc);
@@ -1092,9 +1096,10 @@ static int obmc(Dav1dTaskContext *const t,
// only odd blocks are considered for overlap handling, hence +1
const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+ const int step4 = iclip(a_b_dim[0], 2, 16);
if (a_r->ref.ref[0] > 0) {
- const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
+ const int ow4 = imin(step4, b_dim[0]);
const int oh4 = imin(b_dim[1], 16) >> 1;
res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
t->bx + x, t->by, pl, a_r->mv.mv[0],
@@ -1105,7 +1110,7 @@ static int obmc(Dav1dTaskContext *const t,
h_mul * ow4, v_mul * oh4);
i++;
}
- x += imax(a_b_dim[0], 2);
+ x += step4;
}
}
@@ -1114,10 +1119,11 @@ static int obmc(Dav1dTaskContext *const t,
// only odd blocks are considered for overlap handling, hence +1
const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+ const int step4 = iclip(l_b_dim[1], 2, 16);
if (l_r->ref.ref[0] > 0) {
const int ow4 = imin(b_dim[0], 16) >> 1;
- const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
+ const int oh4 = imin(step4, b_dim[1]);
res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
t->bx, t->by + y, pl, l_r->mv.mv[0],
&f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
@@ -1127,7 +1133,7 @@ static int obmc(Dav1dTaskContext *const t,
dst_stride, lap, h_mul * ow4, v_mul * oh4);
i++;
}
- y += imax(l_b_dim[1], 2);
+ y += step4;
}
return 0;
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/ref.c b/chromium/third_party/dav1d/libdav1d/src/ref.c
index 3889cba5657..46462b4c801 100644
--- a/chromium/third_party/dav1d/libdav1d/src/ref.c
+++ b/chromium/third_party/dav1d/libdav1d/src/ref.c
@@ -88,22 +88,18 @@ Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
return res;
}
-void dav1d_ref_inc(Dav1dRef *const ref) {
- atomic_fetch_add(&ref->ref_cnt, 1);
-}
-
void dav1d_ref_dec(Dav1dRef **const pref) {
assert(pref != NULL);
Dav1dRef *const ref = *pref;
if (!ref) return;
+ *pref = NULL;
if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
const int free_ref = ref->free_ref;
ref->free_callback(ref->const_data, ref->user_data);
if (free_ref) free(ref);
}
- *pref = NULL;
}
int dav1d_ref_is_writable(Dav1dRef *const ref) {
diff --git a/chromium/third_party/dav1d/libdav1d/src/ref.h b/chromium/third_party/dav1d/libdav1d/src/ref.h
index 54f5f69f888..ec070a0a9a3 100644
--- a/chromium/third_party/dav1d/libdav1d/src/ref.h
+++ b/chromium/third_party/dav1d/libdav1d/src/ref.h
@@ -50,9 +50,11 @@ Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr,
void (*free_callback)(const uint8_t *data, void *user_data),
void *user_data);
-void dav1d_ref_inc(Dav1dRef *ref);
void dav1d_ref_dec(Dav1dRef **ref);
-
int dav1d_ref_is_writable(Dav1dRef *ref);
+static inline void dav1d_ref_inc(Dav1dRef *const ref) {
+ atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
+}
+
#endif /* DAV1D_SRC_REF_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/refmvs.c b/chromium/third_party/dav1d/libdav1d/src/refmvs.c
index d49ebaeec6b..c7ed9db8cac 100644
--- a/chromium/third_party/dav1d/libdav1d/src/refmvs.c
+++ b/chromium/third_party/dav1d/libdav1d/src/refmvs.c
@@ -922,15 +922,23 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
} while (--bh4);
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/refmvs.h"
+#elif ARCH_X86
+#include "src/x86/refmvs.h"
+#endif
+#endif
+
COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
{
c->splat_mv = splat_mv_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- dav1d_refmvs_dsp_init_arm(c);
+ refmvs_dsp_init_arm(c);
#elif ARCH_X86
- dav1d_refmvs_dsp_init_x86(c);
+ refmvs_dsp_init_x86(c);
#endif
#endif
}
diff --git a/chromium/third_party/dav1d/libdav1d/src/scan.h b/chromium/third_party/dav1d/libdav1d/src/scan.h
index ca9743fd5ab..09df9887799 100644
--- a/chromium/third_party/dav1d/libdav1d/src/scan.h
+++ b/chromium/third_party/dav1d/libdav1d/src/scan.h
@@ -32,6 +32,6 @@
#include "src/levels.h"
-extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
+EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
#endif /* DAV1D_SRC_SCAN_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/tables.h b/chromium/third_party/dav1d/libdav1d/src/tables.h
index 894f8c237d9..f3c00cfb00a 100644
--- a/chromium/third_party/dav1d/libdav1d/src/tables.h
+++ b/chromium/third_party/dav1d/libdav1d/src/tables.h
@@ -34,38 +34,38 @@
#include "src/levels.h"
-extern const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
-extern const uint8_t /* enum BlockSize */
+EXTERN const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
+EXTERN const uint8_t /* enum BlockSize */
dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2];
// width, height (in 4px blocks), log2 versions of these two
-extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
+EXTERN const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
typedef struct TxfmInfo {
// width, height (in 4px blocks), log2 of them, min/max of log2, sub, pad
uint8_t w, h, lw, lh, min, max, sub, ctx;
} TxfmInfo;
-extern const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
-extern const uint8_t /* enum (Rect)TxfmSize */
+EXTERN const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
+EXTERN const uint8_t /* enum (Rect)TxfmSize */
dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */];
-extern const uint8_t /* enum TxfmType */
+EXTERN const uint8_t /* enum TxfmType */
dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES];
-extern const uint8_t /* enum InterPredMode */
+EXTERN const uint8_t /* enum InterPredMode */
dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
-extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
-extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
+EXTERN const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
+EXTERN const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
-extern const uint8_t dav1d_filter_mode_to_y_mode[5];
-extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
-extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
-extern const uint8_t dav1d_skip_ctx[5][5];
-extern const uint8_t /* enum TxClass */
+EXTERN const uint8_t dav1d_filter_mode_to_y_mode[5];
+EXTERN const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
+EXTERN const uint8_t dav1d_lo_ctx_offsets[3][5][5];
+EXTERN const uint8_t dav1d_skip_ctx[5][5];
+EXTERN const uint8_t /* enum TxClass */
dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
-extern const uint8_t /* enum Filter2d */
+EXTERN const uint8_t /* enum Filter2d */
dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */];
-extern const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
-extern const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
-extern const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
+EXTERN const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
+EXTERN const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
+EXTERN const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
static const unsigned cfl_allowed_mask =
(1 << BS_32x32) |
@@ -103,23 +103,23 @@ static const unsigned interintra_allowed_mask =
(1 << BS_8x16) |
(1 << BS_8x8);
-extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
+EXTERN const Dav1dWarpedMotionParams dav1d_default_wm_params;
-extern const int8_t dav1d_cdef_directions[12][2];
+EXTERN const int8_t dav1d_cdef_directions[12][2];
-extern const uint16_t dav1d_sgr_params[16][2];
-extern const uint8_t dav1d_sgr_x_by_x[256];
+EXTERN const uint16_t dav1d_sgr_params[16][2];
+EXTERN const uint8_t dav1d_sgr_x_by_x[256];
-extern const int8_t dav1d_mc_subpel_filters[6][15][8];
-extern const int8_t dav1d_mc_warp_filter[193][8];
-extern const int8_t dav1d_resize_filter[64][8];
+EXTERN const int8_t dav1d_mc_subpel_filters[6][15][8];
+EXTERN const int8_t dav1d_mc_warp_filter[193][8];
+EXTERN const int8_t dav1d_resize_filter[64][8];
-extern const uint8_t dav1d_sm_weights[128];
-extern const uint16_t dav1d_dr_intra_derivative[44];
-extern const int8_t dav1d_filter_intra_taps[5][64];
+EXTERN const uint8_t dav1d_sm_weights[128];
+EXTERN const uint16_t dav1d_dr_intra_derivative[44];
+EXTERN const int8_t dav1d_filter_intra_taps[5][64];
-extern const uint8_t dav1d_obmc_masks[64];
+EXTERN const uint8_t dav1d_obmc_masks[64];
-extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs
+EXTERN const int16_t dav1d_gaussian_sequence[2048]; // for fgs
#endif /* DAV1D_SRC_TABLES_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/thread_task.c b/chromium/third_party/dav1d/libdav1d/src/thread_task.c
index 53aa41e5c8a..ab2376c30a4 100644
--- a/chromium/third_party/dav1d/libdav1d/src/thread_task.c
+++ b/chromium/third_party/dav1d/libdav1d/src/thread_task.c
@@ -49,9 +49,13 @@ static inline int reset_task_cur(const Dav1dContext *const c,
unsigned frame_idx)
{
const unsigned first = atomic_load(&ttd->first);
+ unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
+ if (reset_frame_idx < first) {
+ if (frame_idx == UINT_MAX) return 0;
+ reset_frame_idx = UINT_MAX;
+ }
if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
return 0;
- unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
if (reset_frame_idx != UINT_MAX) {
if (frame_idx == UINT_MAX) {
if (reset_frame_idx > first + ttd->cur)
@@ -78,12 +82,17 @@ cur_found:
static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
unsigned frame_idx, unsigned n_frames)
{
- if (frame_idx < (unsigned)atomic_load(&ttd->first)) frame_idx += n_frames;
+ const unsigned first = atomic_load(&ttd->first);
+ if (frame_idx < first) frame_idx += n_frames;
unsigned last_idx = frame_idx;
do {
frame_idx = last_idx;
last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
} while (last_idx < frame_idx);
+ if (frame_idx == first && atomic_load(&ttd->first) != first) {
+ unsigned expected = frame_idx;
+ atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
+ }
}
static void insert_tasks_between(Dav1dFrameContext *const f,
@@ -164,6 +173,43 @@ static inline void insert_task(Dav1dFrameContext *const f,
insert_tasks(f, t, t, cond_signal);
}
+static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) {
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ t->next = NULL;
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = t;
+ else
+ f->task_thread.pending_tasks.tail->next = t;
+ f->task_thread.pending_tasks.tail = t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+}
+
+static inline int merge_pending_frame(Dav1dFrameContext *const f) {
+ int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
+ if (merge) {
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ Dav1dTask *t = f->task_thread.pending_tasks.head;
+ f->task_thread.pending_tasks.head = NULL;
+ f->task_thread.pending_tasks.tail = NULL;
+ atomic_store(&f->task_thread.pending_tasks.merge, 0);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+ while (t) {
+ Dav1dTask *const tmp = t->next;
+ insert_task(f, t, 0);
+ t = tmp;
+ }
+ }
+ return merge;
+}
+
+static inline int merge_pending(const Dav1dContext *const c) {
+ int res = 0;
+ for (unsigned i = 0; i < c->n_fc; i++)
+ res |= merge_pending_frame(&c->fc[i]);
+ return res;
+}
+
static int create_filter_sbrow(Dav1dFrameContext *const f,
const int pass, Dav1dTask **res_t)
{
@@ -192,13 +238,14 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
if (prog_sz > f->frame_thread.prog_sz) {
atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
- prog_sz * 2 * sizeof(*prog));
+ 2 * prog_sz * sizeof(*prog));
if (!prog) return -1;
f->frame_thread.frame_progress = prog;
f->frame_thread.copy_lpf_progress = prog + prog_sz;
- f->frame_thread.prog_sz = prog_sz;
}
- memset(f->frame_thread.frame_progress, 0, prog_sz * 2 * sizeof(atomic_uint));
+ f->frame_thread.prog_sz = prog_sz;
+ memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
+ memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint));
atomic_store(&f->frame_thread.deblock_progress, 0);
}
f->frame_thread.next_tile_row[pass & 1] = 0;
@@ -224,16 +271,18 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
Dav1dTask *tasks = f->task_thread.tile_tasks[0];
const int uses_2pass = f->c->n_fc > 1;
const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
- int alloc_num_tasks = num_tasks * (1 + uses_2pass);
- if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
- const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
- tasks = realloc(f->task_thread.tile_tasks[0], size);
- if (!tasks) return -1;
- memset(tasks, 0, size);
- f->task_thread.tile_tasks[0] = tasks;
- f->task_thread.num_tile_tasks = alloc_num_tasks;
+ if (pass < 2) {
+ int alloc_num_tasks = num_tasks * (1 + uses_2pass);
+ if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
+ const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
+ tasks = realloc(f->task_thread.tile_tasks[0], size);
+ if (!tasks) return -1;
+ memset(tasks, 0, size);
+ f->task_thread.tile_tasks[0] = tasks;
+ f->task_thread.num_tile_tasks = alloc_num_tasks;
+ }
+ f->task_thread.tile_tasks[1] = tasks + num_tasks;
}
- f->task_thread.tile_tasks[1] = tasks + num_tasks;
tasks += num_tasks * (pass & 1);
Dav1dTask *pf_t;
@@ -263,8 +312,22 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
prev_t->next = pf_t;
prev_t = pf_t;
}
- insert_tasks(f, &tasks[0], prev_t, cond_signal);
- f->task_thread.done[pass & 1] = 0;
+ prev_t->next = NULL;
+
+ atomic_store(&f->task_thread.done[pass & 1], 0);
+
+ // XXX in theory this could be done locklessly, at this point there are no
+ // tasks in the frameQ, so no other runner should be using this lock, but
+ // we must add both passes at once
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ assert(f->task_thread.pending_tasks.head == NULL || pass == 2);
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = &tasks[0];
+ else
+ f->task_thread.pending_tasks.tail->next = &tasks[0];
+ f->task_thread.pending_tasks.tail = prev_t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
return 0;
}
@@ -272,7 +335,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
void dav1d_task_frame_init(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;
- f->task_thread.init_done = 0;
+ atomic_store(&f->task_thread.init_done, 0);
// schedule init task, which will schedule the remaining tasks
Dav1dTask *const t = &f->task_thread.init_task;
t->type = DAV1D_TASK_TYPE_INIT;
@@ -307,16 +370,12 @@ static inline int ensure_progress(struct TaskThreadData *const ttd,
// so ensure that completed. if not, re-add to task-queue; else, fall-through
int p1 = atomic_load(state);
if (p1 < t->sby) {
+ t->type = type;
+ t->recon_progress = t->deblock_progress = 0;
+ *target = t->sby;
+ add_pending(f, t);
pthread_mutex_lock(&ttd->lock);
- p1 = atomic_load(state);
- if (p1 < t->sby) {
- t->type = type;
- t->recon_progress = t->deblock_progress = 0;
- *target = t->sby;
- insert_task(f, t, 0);
- return 1;
- }
- pthread_mutex_unlock(&ttd->lock);
+ return 1;
}
return 0;
}
@@ -369,11 +428,29 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
return 0;
}
+static inline int get_frame_progress(const Dav1dContext *const c,
+ const Dav1dFrameContext *const f)
+{
+ unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
+ if (frame_prog >= FRAME_ERROR)
+ return f->sbh - 1;
+ int idx = frame_prog >> (f->sb_shift + 7);
+ int prog;
+ do {
+ atomic_uint *state = &f->frame_thread.frame_progress[idx];
+ const unsigned val = ~atomic_load(state);
+ prog = val ? ctz(val) : 32;
+ if (prog != 32) break;
+ prog = 0;
+ } while (++idx < f->frame_thread.prog_sz);
+ return ((idx << 5) | prog) - 1;
+}
+
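get_frame_progress() scans per-frame bitmaps in which bit i of word w marks superblock row w*32 + i as finished, and uses count-trailing-zeros on the inverted word to find the first unfinished row. A standalone sketch of the same scan over plain (non-atomic) words, using the GCC/Clang builtin in place of dav1d's ctz helper:

    #include <stdint.h>

    /* Returns the index of the last contiguously completed row starting from
     * word `start`, or -1 if none; mirrors the loop above on plain uint32_t. */
    static int last_contiguous_row(const uint32_t *words, int n_words, int start)
    {
        int idx = start, bit;
        do {
            const uint32_t missing = ~words[idx];         /* zero bits = unfinished rows */
            bit = missing ? __builtin_ctz(missing) : 32;  /* 32: whole word finished */
            if (bit != 32) break;
            bit = 0;
        } while (++idx < n_words);
        return ((idx << 5) | bit) - 1;
    }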
static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
- f->task_thread.task_counter = 0;
- f->task_thread.done[0] = 1;
- f->task_thread.done[1] = 1;
+ atomic_store(&f->task_thread.task_counter, 0);
+ atomic_store(&f->task_thread.done[0], 1);
+ atomic_store(&f->task_thread.done[1], 1);
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
dav1d_decode_frame_exit(f, error);
@@ -478,6 +555,8 @@ void *dav1d_worker_task(void *data) {
for (;;) {
if (tc->task_thread.die) break;
if (atomic_load(c->flush)) goto park;
+
+ merge_pending(c);
if (ttd->delayed_fg.exec) { // run delayed film grain first
delayed_fg_task(c, ttd);
continue;
@@ -488,11 +567,18 @@ void *dav1d_worker_task(void *data) {
for (unsigned i = 0; i < c->n_fc; i++) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + i) % c->n_fc];
- if (f->task_thread.init_done) continue;
+ if (atomic_load(&f->task_thread.init_done)) continue;
t = f->task_thread.task_head;
if (!t) continue;
if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
+ // XXX This could be a simple else if tasks of both
+ // passes were added at once (in dav1d_task_create_tile_sbrow).
+ // Adding the tasks to the pending queue can result in a
+ // thread merging them before init_done is set.
+ // We would need to set init_done before adding to the
+ // pending queue, so maybe return the tasks, set init_done,
+ // and only then add them to the pending queue.
const int p1 = f->in_cdf.progress ?
atomic_load(f->in_cdf.progress) : 1;
if (p1) {
@@ -505,6 +591,7 @@ void *dav1d_worker_task(void *data) {
while (ttd->cur < c->n_fc) { // run decoding tasks last
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + ttd->cur) % c->n_fc];
+ merge_pending_frame(f);
prev_t = f->task_thread.task_cur_prev;
t = prev_t ? prev_t->next : f->task_thread.task_head;
while (t) {
@@ -519,11 +606,12 @@ void *dav1d_worker_task(void *data) {
} else if (t->recon_progress) {
const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
int error = atomic_load(&f->task_thread.error);
- assert(!f->task_thread.done[p] || error);
+ assert(!atomic_load(&f->task_thread.done[p]) || error);
const int tile_row_base = f->frame_hdr->tiling.cols *
f->frame_thread.next_tile_row[p];
if (p) {
- const int p1 = f->frame_thread.entropy_progress;
+ atomic_int *const prog = &f->frame_thread.entropy_progress;
+ const int p1 = atomic_load(prog);
if (p1 < t->sby) goto next;
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
}
@@ -567,6 +655,7 @@ void *dav1d_worker_task(void *data) {
ttd->cur++;
}
if (reset_task_cur(c, ttd, UINT_MAX)) continue;
+ if (merge_pending(c)) continue;
park:
tc->task_thread.flushed = 1;
pthread_cond_signal(&tc->task_thread.td.cond);
@@ -584,6 +673,7 @@ void *dav1d_worker_task(void *data) {
if (!t->next) f->task_thread.task_tail = prev_t;
if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
ttd->cur++;
+ t->next = NULL;
// we don't need to check cond_signaled here, since we found a task
// after the last signal so we want to re-signal the next waiting thread
// and again won't need to signal after that
@@ -605,13 +695,13 @@ void *dav1d_worker_task(void *data) {
if (res || p1 == TILE_ERROR) {
pthread_mutex_lock(&ttd->lock);
abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
- } else if (!res) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ } else {
t->type = DAV1D_TASK_TYPE_INIT_CDF;
if (p1) goto found_unlocked;
+ add_pending(f, t);
pthread_mutex_lock(&ttd->lock);
- insert_task(f, t, 0);
}
- reset_task_cur(c, ttd, t->frame_idx);
continue;
}
case DAV1D_TASK_TYPE_INIT_CDF: {
@@ -619,7 +709,6 @@ void *dav1d_worker_task(void *data) {
int res = DAV1D_ERR(EINVAL);
if (!atomic_load(&f->task_thread.error))
res = dav1d_decode_frame_init_cdf(f);
- pthread_mutex_lock(&ttd->lock);
if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
}
@@ -628,23 +717,34 @@ void *dav1d_worker_task(void *data) {
for (int p = 1; p <= 2; p++) {
const int res = dav1d_task_create_tile_sbrow(f, p, 0);
if (res) {
+ pthread_mutex_lock(&ttd->lock);
// memory allocation failed
- f->task_thread.done[2 - p] = 1;
+ atomic_store(&f->task_thread.done[2 - p], 1);
atomic_store(&f->task_thread.error, -1);
- f->task_thread.task_counter -= f->sbh +
- f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+ atomic_fetch_sub(&f->task_thread.task_counter,
+ f->frame_hdr->tiling.cols *
+ f->frame_hdr->tiling.rows + f->sbh);
atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
- if (p == 2 && f->task_thread.done[1]) {
- assert(!f->task_thread.task_counter);
+ if (p == 2 && atomic_load(&f->task_thread.done[1])) {
+ assert(!atomic_load(&f->task_thread.task_counter));
dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
+ atomic_store(&f->task_thread.init_done, 1);
+ continue;
+ } else {
+ pthread_mutex_unlock(&ttd->lock);
}
}
}
- } else abort_frame(f, res);
- reset_task_cur(c, ttd, t->frame_idx);
- f->task_thread.init_done = 1;
+ atomic_store(&f->task_thread.init_done, 1);
+ pthread_mutex_lock(&ttd->lock);
+ } else {
+ pthread_mutex_lock(&ttd->lock);
+ abort_frame(f, res);
+ reset_task_cur(c, ttd, t->frame_idx);
+ atomic_store(&f->task_thread.init_done, 1);
+ }
continue;
}
case DAV1D_TASK_TYPE_TILE_ENTROPY:
@@ -673,10 +773,9 @@ void *dav1d_worker_task(void *data) {
pthread_cond_signal(&ttd->cond);
goto found_unlocked;
}
- pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
- reset_task_cur(c, ttd, t->frame_idx);
- insert_task(f, t, 0);
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
} else {
pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
@@ -692,15 +791,16 @@ void *dav1d_worker_task(void *data) {
if (c->n_fc > 1)
atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
}
- if (!--f->task_thread.task_counter && f->task_thread.done[0] &&
- (!uses_2pass || f->task_thread.done[1]))
+ if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 &&
+ atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
{
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
error ? DAV1D_ERR(ENOMEM) : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
- assert(f->task_thread.task_counter >= 0);
+ assert(atomic_load(&f->task_thread.task_counter) >= 0);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
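The hunk above replaces the plain counter decrement with atomic_fetch_sub so that exactly one worker observes the count reaching zero and, once both pass-done flags are set, runs the frame-exit path. A minimal sketch of that last-task-out pattern (type and function names here are illustrative, not the dav1d ones):

    #include <stdatomic.h>
    #include <stdbool.h>

    typedef struct {
        atomic_int task_counter;
        atomic_int done[2];   /* [0] = reconstruction pass, [1] = entropy pass */
    } FramePasses;

    /* Returns true for exactly one caller: the one that finished the last task
     * while both passes are already marked done. */
    static bool on_task_finished(FramePasses *f, bool uses_2pass)
    {
        const int remaining = atomic_fetch_sub(&f->task_counter, 1) - 1;
        return remaining == 0 && atomic_load(&f->done[0]) &&
               (!uses_2pass || atomic_load(&f->done[1]));
    }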
@@ -734,15 +834,11 @@ void *dav1d_worker_task(void *data) {
if (sby) {
int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
if (~prog & (1U << ((sby - 1) & 31))) {
+ t->type = DAV1D_TASK_TYPE_CDEF;
+ t->recon_progress = t->deblock_progress = 0;
+ add_pending(f, t);
pthread_mutex_lock(&ttd->lock);
- prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
- if (~prog & (1U << ((sby - 1) & 31))) {
- t->type = DAV1D_TASK_TYPE_CDEF;
- t->recon_progress = t->deblock_progress = 0;
- insert_task(f, t, 0);
- continue;
- }
- pthread_mutex_unlock(&ttd->lock);
+ continue;
}
}
}
@@ -776,40 +872,53 @@ void *dav1d_worker_task(void *data) {
const int uses_2pass = c->n_fc > 1;
const int sbh = f->sbh;
const int sbsz = f->sb_step * 4;
- const enum PlaneType progress_plane_type =
- t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS ? PLANE_TYPE_BLOCK :
- c->n_fc > 1 ? PLANE_TYPE_Y : PLANE_TYPE_ALL;
- if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS)
- atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
- 1U << (sby & 31));
- pthread_mutex_lock(&ttd->lock);
- if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
- unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
- if (frame_prog < FRAME_ERROR) {
- int idx = frame_prog >> (f->sb_shift + 7);
- int prog;
- do {
- atomic_uint *state = &f->frame_thread.frame_progress[idx];
- const unsigned val = ~atomic_load(state);
- prog = val ? ctz(val) : 32;
- if (prog != 32) break;
- prog = 0;
- } while (++idx < f->frame_thread.prog_sz);
- sby = ((idx << 5) | prog) - 1;
- } else sby = sbh - 1;
+ if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
+ error = atomic_load(&f->task_thread.error);
+ const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
+ assert(c->n_fc > 1);
+ if (f->sr_cur.p.data[0] /* upon flush, this can be freed already */)
+ atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y);
+ atomic_store(&f->frame_thread.entropy_progress,
+ error ? TILE_ERROR : sby + 1);
+ if (sby + 1 == sbh)
+ atomic_store(&f->task_thread.done[1], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ atomic_load(&f->task_thread.done[1]))
+ {
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
}
+ // t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS
+ atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
+ 1U << (sby & 31));
+ pthread_mutex_lock(&f->task_thread.lock);
+ sby = get_frame_progress(c, f);
error = atomic_load(&f->task_thread.error);
const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
- if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) {
- const int idx = t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
- atomic_store(&f->sr_cur.progress[idx], error ? FRAME_ERROR : y);
- }
- if (progress_plane_type == PLANE_TYPE_BLOCK)
- f->frame_thread.entropy_progress = error ? TILE_ERROR : sby + 1;
+ if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be freed already */)
+ atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y);
+ pthread_mutex_unlock(&f->task_thread.lock);
if (sby + 1 == sbh)
- f->task_thread.done[progress_plane_type == PLANE_TYPE_BLOCK] = 1;
- if (!--f->task_thread.task_counter &&
- f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1]))
+ atomic_store(&f->task_thread.done[0], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
{
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
error ? DAV1D_ERR(ENOMEM) : 0);
diff --git a/chromium/third_party/dav1d/libdav1d/src/wedge.h b/chromium/third_party/dav1d/libdav1d/src/wedge.h
index 45f0570a270..586be98c42c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/wedge.h
+++ b/chromium/third_party/dav1d/libdav1d/src/wedge.h
@@ -31,11 +31,11 @@
#include "src/levels.h"
void dav1d_init_wedge_masks(void);
-extern const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
+EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[2 /* sign */][16 /* wedge_idx */];
void dav1d_init_interintra_masks(void);
-extern const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
+EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[N_INTER_INTRA_PRED_MODES];
#endif /* DAV1D_SRC_WEDGE_H */
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h
new file mode 100644
index 00000000000..553d6507412
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#define decl_cdef_fns(ext) \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
+
+decl_cdef_fns(avx512icl);
+decl_cdef_fns(avx2);
+decl_cdef_fns(sse4);
+decl_cdef_fns(ssse3);
+decl_cdef_fns(sse2);
+
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
+
+static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+#if BITDEPTH == 8
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->dir = BF(dav1d_cdef_dir, ssse3);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+ c->dir = BF(dav1d_cdef_dir, sse4);
+#if BITDEPTH == 8
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->dir = BF(dav1d_cdef_dir, avx2);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm
new file mode 100644
index 00000000000..6d625a02a0c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm
@@ -0,0 +1,622 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21
+ db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29
+ db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37
+ db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
+end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
+ dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
+ dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
+pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4
+cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6
+ dw 1, 2, 1, 10, 9, 18, 8, 17
+ dw 8, 16, 8, 15, -7,-14, 1, -6
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28
+ db 2, 4, 2, 36, 34, 68, 32, 66
+ db 32, 64, 32, 62,-30,-60, 2,-28
+pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3
+sec_taps4: dw 32, 16
+pw_m16384: times 2 dw -16384
+pw_2048: times 2 dw 2048
+pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4)
+edge_mask8: dw 0x2121, 0x2020, 0x0101
+
+SECTION .text
+
+%macro CONSTRAIN 7 ; dst, p, px, zero, thresh, shift, tmp
+ psubw %1, %2, %3
+ pabsw %1, %1
+ vpcmpgtw k1, %3, %2
+ vpsrlvw %7, %1, %6
+ psubusw %7, %5, %7
+ pminsw %1, %7
+ vpsubw %1{k1}, %4, %1
+%endmacro
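CONSTRAIN is the vector form of the usual CDEF constrain step: clamp the magnitude of the pixel difference to threshold - (|diff| >> shift), floored at zero, then restore the sign. A scalar C reference of that computation (a sketch; the helper name is illustrative):

    #include <stdlib.h>

    static inline int cdef_constrain(const int p, const int px,
                                     const int threshold, const int shift)
    {
        const int diff    = p - px;
        const int adiff   = abs(diff);
        const int limit   = threshold - (adiff >> shift);   /* may go negative */
        const int clamped = adiff < limit ? adiff
                          : limit > 0     ? limit : 0;       /* min(adiff, max(0, limit)) */
        return diff < 0 ? -clamped : clamped;                /* restore the sign of diff */
    }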
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs4
+ lea r6, [cdef_dirs4]
+ movu xm3, [dstq+strideq*0]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ mova xm2, [leftq]
+ lea r2, [dstq+strideq*2]
+ vinserti32x4 m3, [r2+strideq*0], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m3, [r2+strideq*1], 3
+ vpermt2d m2, m5, m3
+ vinserti32x4 m1, m2, [topq+strideq*0-4], 0
+ vinserti32x4 m1, [topq+strideq*1-4], 1
+ mov r3d, edgem
+ movifnidn prid, prim
+ punpcklwd m3, m3 ; px
+ psrlw m5, 8
+ vpbroadcastd m0, [base+pd_268435568]
+ pxor m12, m12
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m6, m3, m8
+ pmaxsw m7, m3, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ call .constrain_sec
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ psrldq m8, m6, 2
+ vpshldd m3, m0, 8
+ psrldq m9, m7, 2
+ paddd m0, m3
+ pminuw m6, m8
+ psrldq m0, 1
+ pmaxsw m7, m9
+ pmaxsw m0, m6
+ pminsw m0, m7
+ vpmovdw ym0, m0
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ mov r4d, dirm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym1, [base+end_perm4]
+ vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ vpermb m0, m1, m0
+.end:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm0, ym0, 1
+ movq [r2+strideq*0], xm0
+ movhps [r2+strideq*1], xm0
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ or r3d, 0x04
+ vmovdqa32 m1{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m8, m5, m9
+ vpermi2w m8, m1, m2 ; k0p0 k1p0
+ psubw m9, m5, m9
+ vpermi2w m9, m1, m2 ; k0p1 k1p1
+ CONSTRAIN m10, m8, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ CONSTRAIN m10, m9, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ ret
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+ lea r6, [cdef_dirs4]
+ movu xm18, [dstq+strideq*0]
+ vinserti128 ym18, [dstq+strideq*1], 1
+ mova xm1, [leftq+16*0]
+ mova xm2, [leftq+16*1]
+ lea r2, [strideq*3]
+ vinserti32x4 m18, [dstq+strideq*2], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m18, [dstq+r2 ], 3
+ vpermt2d m1, m5, m18
+ vinserti32x4 m0, m1, [topq+strideq*0-4], 0
+ vinserti32x4 m0, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu xm19, [r3+strideq*0]
+ vinserti128 ym19, [r3+strideq*1], 1
+ vinserti32x4 m19, [r3+strideq*2], 2
+ vinserti32x4 m19, [r3+r2 ], 3
+ mov r3d, edgem
+ movifnidn prid, prim
+ vpermt2d m2, m5, m19
+ vpbroadcastd m16, [base+pd_268435568]
+ pxor m12, m12
+ punpcklwd m18, m18 ; px (top)
+ psrlw m5, 8
+ punpcklwd m19, m19 ; px (bottom)
+ mova m17, m16
+ vshufi32x4 m1, m2, q3210
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m3, m18, m6
+ pmaxsw m4, m18, m6
+ pminuw m20, m19, m7
+ pmaxsw m21, m19, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ call .constrain_sec
+ pminuw m3, m6
+ pmaxsw m4, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m3, m6
+ pmaxsw m4, m6
+ mov r3, 0xcccccccccccccccc
+ pminuw m20, m7
+ pmaxsw m21, m7
+ kmovq k1, r3
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vbroadcasti32x4 m0, [base+deint_shuf]
+ vpshldd m6, m20, m3, 16
+ vmovdqu8 m3{k1}, m20
+ vpshldd m18, m16, 8
+ vpshldd m7, m21, m4, 16
+ vmovdqu8 m4{k1}, m21
+ vpshldd m19, m17, 8
+ pminuw m3, m6
+ paddd m16, m18
+ pmaxsw m4, m7
+ paddd m17, m19
+ psrldq m16, 1
+ palignr m16{k1}, m17, m17, 15
+ lea r6, [dstq+strideq*4]
+ pmaxsw m16, m3
+ pminsw m16, m4
+ pshufb m16, m0
+ movq [dstq+strideq*0], xm16
+ movhps [r6 +strideq*0], xm16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*1], xm17
+ movhps [r6 +strideq*1], xm17
+ vextracti32x4 xm17, m16, 2
+ movq [dstq+strideq*2], xm17
+ movhps [r6 +strideq*2], xm17
+ vextracti32x4 xm16, m16, 3
+ movq [dstq+r2 ], xm16
+ movhps [r6 +r2 ], xm16
+ RET
+.sec_only:
+ mov r4d, dirm
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym20, [base+end_perm4]
+ vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m19, m17, 8
+ paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddd m17, m19
+ vpermb m16, m20, m16
+ vpermb m17, m20, m17
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ vextracti128 xm16, ym16, 1
+ movq [dstq+strideq*2], xm16
+ movhps [dstq+r2 ], xm16
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm17
+ movhps [dstq+strideq*1], xm17
+ vextracti128 xm17, ym17, 1
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ mov r4d, r3d
+ or r3d, 0x0c
+ vmovdqa32 m0{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ or r4d, 0x04
+ vmovdqa32 m1{k1}, m6
+ kmovw k1, [base+edge_mask4-8+r4*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m7, m5, m9
+ mova m6, m0
+ vpermt2w m6, m7, m1 ; k0p0 k1p0 (top)
+ psubw m9, m5, m9
+ mova m8, m0
+ vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom)
+ CONSTRAIN m10, m6, m18, m12, m13, m14, m11
+ vpermt2w m8, m9, m1 ; k0p1 k1p1 (top)
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m7, m19, m12, m13, m14, m11
+ vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom)
+ vpdpwssd m17, m10, m15
+ CONSTRAIN m10, m8, m18, m12, m13, m14, m11
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m9, m19, m12, m13, m14, m11
+ vpdpwssd m17, m10, m15
+ ret
+
+cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs8
+ lea r6, [cdef_dirs8]
+ movu ym17, [dstq+strideq*0]
+ vinserti32x8 m17, [dstq+strideq*1], 1
+ movq xm4, [leftq+8*0]
+ movq xm5, [leftq+8*1]
+ psrld m2, [base+cdef_perm], 16
+ movq xm6, [leftq+8*2]
+ movq xm7, [leftq+8*3]
+ lea r2, [strideq*3]
+ movu ym16, [topq+strideq*0-4]
+ vinserti32x8 m16, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu ym18, [dstq+strideq*2]
+ vinserti32x8 m18, [dstq+r2 ], 1
+ movu ym19, [r3+strideq*0]
+ vinserti32x8 m19, [r3+strideq*1], 1
+ movu ym20, [r3+strideq*2]
+ vinserti32x8 m20, [r3+r2 ], 1
+ vshufi32x4 m0, m17, m18, q2020 ; px (top)
+ mov r3d, edgem
+ vshufi32x4 m1, m19, m20, q2020 ; px (bottom)
+ movifnidn prid, prim
+ vpermt2d m17, m2, m4
+ vpermt2d m18, m2, m5
+ pxor m12, m12
+ vpermt2d m19, m2, m6
+ vpermt2d m20, m2, m7
+ cmp r3d, 0x0f
+ jne .mask_edges
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+.main:
+ mova [rsp+64*0], m16 ; top
+ mova [rsp+64*1], m17 ; 0 1
+ mova [rsp+64*2], m18 ; 2 3
+ mova [rsp+64*3], m19 ; 4 5
+ mova [rsp+64*4], m20 ; 6 7
+ mova [rsp+64*5], m21 ; bottom
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ add r4d, r3d ; pri_shift
+ vpbroadcastw m14, r4d
+ mov r4d, dirm
+ vpbroadcastd m2, [base+pri_taps8+priq*2+0]
+ vpbroadcastd m3, [base+pri_taps8+priq*2+4]
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
+ pmaxsw m14, m12
+ call .constrain
+ mov r5d, secm
+ pmullw m16, m8, m2
+ pmullw m17, m9, m2
+ test r5d, r5d
+ jnz .pri_sec
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ call .constrain
+ pmullw m8, m3
+ pmullw m9, m3
+ jmp .end_no_clip
+.pri_sec:
+ lzcnt r5d, r5d
+ add r3d, r5d ; sec_shift
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ pminuw m18, m0, m4
+ pmaxsw m19, m0, m4
+ pminuw m20, m1, m5
+ pmaxsw m21, m1, m5
+ call .min_max_constrain2
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
+ pmullw m8, m3
+ pmullw m9, m3
+ vpbroadcastw m13, secm
+ vpbroadcastw m14, r3d
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
+ mova m2, m8
+ mova m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
+ paddw m2, m8
+ paddw m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
+ paddw m2, m2
+ paddw m3, m3
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m2
+ paddw m17, m3
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ paddw m16, m0
+ paddw m17, m1
+ pmaxsw m16, m18
+ pmaxsw m17, m20
+ pminsw m16, m19
+ pminsw m17, m21
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r4d, dirm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0]
+ vpbroadcastw m14, r3d
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0]
+ mova m16, m8
+ mova m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1]
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1]
+ paddw m16, m16
+ paddw m17, m17
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+.end_no_clip:
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ paddw m16, m0
+ paddw m17, m1
+.end:
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm17
+ vextracti128 [dstq+strideq*1], ym17, 1
+ vextracti32x4 [dstq+strideq*2], m17, 2
+ vextracti32x4 [dstq+r2 ], m17, 3
+ RET
+.mask_edges:
+ vpbroadcastd m2, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+ jmp .mask_edges_top
+.mask_edges_no_bottom:
+ mova m21, m2
+.mask_edges_top:
+ test r3b, 0x04
+ jnz .mask_edges_main
+ mova m16, m2
+.mask_edges_main:
+ and r3d, 0x03
+ cmp r3d, 0x03
+ je .main
+ kmovw k1, [base+edge_mask8+r3*2]
+ vmovdqa32 m16{k1}, m2 ; edge pixels = -16384
+ vmovdqa32 m17{k1}, m2
+ vmovdqa32 m18{k1}, m2
+ vmovdqa32 m19{k1}, m2
+ vmovdqa32 m20{k1}, m2
+ vmovdqa32 m21{k1}, m2
+ jmp .main
+ALIGN function_align
+.min_max_constrain:
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+.min_max_constrain2:
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+.constrain:
+ %define tmp rsp+gprsize+68
+ movu m4, [tmp+r5+64*0]
+ vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
+ movu m5, [tmp+r5+64*2]
+ vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
+ neg r5
+ movu m6, [tmp+r5+64*0]
+ vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
+ movu m7, [tmp+r5+64*2]
+ vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
+ CONSTRAIN m8, m4, m0, m12, m13, m14, m15
+ CONSTRAIN m9, m5, m1, m12, m13, m14, m15
+ CONSTRAIN m10, m6, m0, m12, m13, m14, m15
+ CONSTRAIN m11, m7, m1, m12, m13, m14, m15
+ paddw m8, m10
+ paddw m9, m11
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h
new file mode 100644
index 00000000000..eeaa328d1e1
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+
+#define decl_fg_fns(ext) \
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext))
+
+decl_fg_fns(ssse3);
+decl_fg_fns(avx2);
+decl_fg_fns(avx512icl);
+
+static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h
new file mode 100644
index 00000000000..7df563fee1c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(dav1d_##name, ssse3)); \
+ decl_##type##_fn(BF(dav1d_##name, avx2)); \
+ decl_##type##_fn(BF(dav1d_##name, avx512icl))
+#define init_fn(type0, type1, name, suffix) \
+ c->type0[type1] = BF(dav1d_##name, suffix)
+
+#define init_angular_ipred_fn(type, name, suffix) \
+ init_fn(intra_pred, type, name, suffix)
+#define init_cfl_pred_fn(type, name, suffix) \
+ init_fn(cfl_pred, type, name, suffix)
+#define init_cfl_ac_fn(type, name, suffix) \
+ init_fn(cfl_ac, type, name, suffix)
+
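These wrappers rely on token pasting: BF() (and BF_BPC() in itx.h further down) glue the function name, the configured bitdepth and the ISA suffix into the asm symbol before storing it in the DSP table. A self-contained sketch of that pattern with illustrative macro and function names (not the dav1d definitions):

    #include <stdio.h>

    /* Illustrative stand-ins for the BF()/init_fn() style macros above. */
    #define PASTE_BPC(name, ext)         name##_8bpc_##ext
    #define INIT_FN(tbl, idx, name, ext) (tbl)[idx] = PASTE_BPC(name, ext)

    static void ipred_dc_8bpc_ssse3(void) { puts("dc, ssse3"); }

    int main(void) {
        void (*intra_pred[1])(void);
        INIT_FN(intra_pred, 0, ipred_dc, ssse3); /* -> intra_pred[0] = ipred_dc_8bpc_ssse3 */
        intra_pred[0]();
        return 0;
    }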
+decl_fn(angular_ipred, ipred_dc);
+decl_fn(angular_ipred, ipred_dc_128);
+decl_fn(angular_ipred, ipred_dc_top);
+decl_fn(angular_ipred, ipred_dc_left);
+decl_fn(angular_ipred, ipred_h);
+decl_fn(angular_ipred, ipred_v);
+decl_fn(angular_ipred, ipred_paeth);
+decl_fn(angular_ipred, ipred_smooth);
+decl_fn(angular_ipred, ipred_smooth_h);
+decl_fn(angular_ipred, ipred_smooth_v);
+decl_fn(angular_ipred, ipred_z1);
+decl_fn(angular_ipred, ipred_z2);
+decl_fn(angular_ipred, ipred_z3);
+decl_fn(angular_ipred, ipred_filter);
+
+decl_fn(cfl_pred, ipred_cfl);
+decl_fn(cfl_pred, ipred_cfl_128);
+decl_fn(cfl_pred, ipred_cfl_top);
+decl_fn(cfl_pred, ipred_cfl_left);
+
+decl_fn(cfl_ac, ipred_cfl_ac_420);
+decl_fn(cfl_ac, ipred_cfl_ac_422);
+decl_fn(cfl_ac, ipred_cfl_ac_444);
+
+decl_fn(pal_pred, pal_pred);
+
+static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
+
+ c->pal_pred = BF(dav1d_pal_pred, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+#endif
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm
index 4a1b060bd5f..1a307adc985 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm
@@ -114,20 +114,20 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
vbroadcasti32x4 m2, [tlq]
pshufb m2, m7 ; left
PAETH 4, 5, 6
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm8, ym0, 1
+ vextracti32x4 xm9, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+r6 ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm8
+ movq [dstq+r6 ], xm9
sub hd, 8
jl .w4_end
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+r6 ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm8
+ movhps [dstq+r6 ], xm9
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_end:
@@ -220,19 +220,19 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pshufb m3, m4
pmulhrsw m3, m5
paddw m3, m6
- vextracti32x4 xmm0, m3, 3
- vextracti32x4 xmm1, ym3, 1
- vextracti32x4 xmm2, m3, 2
- movhps [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
+ vextracti32x4 xm0, m3, 3
+ vextracti32x4 xm1, ym3, 1
+ vextracti32x4 xm2, m3, 2
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm3
add hq, 8
jg .end
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jl .w4_loop
@@ -337,20 +337,20 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
psubw m0, m6 ; left - right
pmulhrsw m0, m5
paddw m0, m6
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
sub hd, 8*2
jl .end
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.end:
@@ -472,11 +472,11 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
vpdpwssd m0, m1, m6
vpermb m0, m14, m0
pavgw ym0, ym15
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
add v_weightsq, 4*4
sub hd, 4*2
@@ -624,11 +624,11 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
pmovzxbw ym0, [idxq]
add idxq, 16
vpermw ym0, ym0, ym3
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- movq [dstq+strideq*2], xmm1
- movhps [dstq+stride3q ], xmm1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm
index 050ec9bb253..38c86b54f5c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm
@@ -242,9 +242,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
jmp wq
.w8:
movq xmm1, [tlq+1]
- vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
- paddd xmm2, xm0
+ paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
@@ -275,9 +275,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
jmp wq
.w16:
movu xmm1, [tlq+1]
- vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
- paddd xmm2, xm0
+ paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
@@ -309,8 +309,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
.w32:
movu ym1, [tlq+1]
vpdpbusd ym0, ym1, ym3
- vextracti32x4 xmm1, ym0, 1
- paddd xmm1, xm0
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
@@ -345,8 +345,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
movu ym2, [tlq+33]
vpdpbusd ym0, ym1, ym3
vpdpbusd ym0, ym2, ym3
- vextracti32x4 xmm1, ym0, 1
- paddd xmm1, xm0
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
@@ -524,12 +524,12 @@ INIT_YMM avx512icl
pextrd [dstq+stride3q ], xm0, 3
sub hd, 8
jl .w4_ret
- vextracti32x4 xmm0, m0, 1
+ vextracti32x4 xm0, m0, 1
lea dstq, [dstq+strideq*4]
- movd [dstq+strideq*0], xmm0
- pextrd [dstq+strideq*1], xmm0, 1
- pextrd [dstq+strideq*2], xmm0, 2
- pextrd [dstq+stride3q ], xmm0, 3
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_ret:
@@ -545,20 +545,20 @@ INIT_ZMM avx512icl
vpbroadcastq m4, [tlq+hq-8]
pshufb m4, m9
PAETH
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
sub hd, 8
jl .w8_ret
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jg .w8_loop
.w8_ret:
@@ -639,18 +639,18 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
add hq, 8
jg .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jl .w4_loop
.ret:
@@ -669,11 +669,11 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w8_loop
@@ -785,18 +785,18 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret:
@@ -815,11 +815,11 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
@@ -937,18 +937,18 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret:
@@ -978,11 +978,11 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx.h b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h
new file mode 100644
index 00000000000..46cfdb75d1d
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h
@@ -0,0 +1,356 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
+decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx2_fns ( 8, 32, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
+decl_itx12_fns(16, 16, ext); \
+decl_itx2_fns (16, 32, ext); \
+decl_itx2_fns (32, 8, ext); \
+decl_itx2_fns (32, 16, ext); \
+decl_itx2_fns (32, 32, ext); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
+
+
+#define decl_itx2_bpc_fns(w, h, bpc, opt) \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt))
+
+#define decl_itx12_bpc_fns(w, h, bpc, opt) \
+decl_itx2_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt))
+
+#define decl_itx16_bpc_fns(w, h, bpc, opt) \
+decl_itx12_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt))
+
+#define decl_itx_bpc_fns(bpc, ext) \
+decl_itx16_bpc_fns( 4, 4, bpc, ext); \
+decl_itx16_bpc_fns( 4, 8, bpc, ext); \
+decl_itx16_bpc_fns( 4, 16, bpc, ext); \
+decl_itx16_bpc_fns( 8, 4, bpc, ext); \
+decl_itx16_bpc_fns( 8, 8, bpc, ext); \
+decl_itx16_bpc_fns( 8, 16, bpc, ext); \
+decl_itx2_bpc_fns ( 8, 32, bpc, ext); \
+decl_itx16_bpc_fns(16, 4, bpc, ext); \
+decl_itx16_bpc_fns(16, 8, bpc, ext); \
+decl_itx12_bpc_fns(16, 16, bpc, ext); \
+decl_itx2_bpc_fns (16, 32, bpc, ext); \
+decl_itx2_bpc_fns (32, 8, bpc, ext); \
+decl_itx2_bpc_fns (32, 16, bpc, ext); \
+decl_itx2_bpc_fns (32, 32, bpc, ext); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
+
+decl_itx_fns(avx512icl);
+decl_itx_bpc_fns(10, avx512icl);
+decl_itx_fns(avx2);
+decl_itx_bpc_fns(10, avx2);
+decl_itx_bpc_fns(12, avx2);
+decl_itx_fns(sse4);
+decl_itx_fns(ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
+
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+
+#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
+
+#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext)
+
+#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext)
+
+#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext)
+
+#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn(, 4, 4, ssse3);
+ assign_itx16_fn(R, 4, 8, ssse3);
+ assign_itx16_fn(R, 8, 4, ssse3);
+ assign_itx16_fn(, 8, 8, ssse3);
+ assign_itx16_fn(R, 4, 16, ssse3);
+ assign_itx16_fn(R, 16, 4, ssse3);
+ assign_itx16_fn(R, 8, 16, ssse3);
+ assign_itx16_fn(R, 16, 8, ssse3);
+ assign_itx12_fn(, 16, 16, ssse3);
+ assign_itx2_fn (R, 8, 32, ssse3);
+ assign_itx2_fn (R, 32, 8, ssse3);
+ assign_itx2_fn (R, 16, 32, ssse3);
+ assign_itx2_fn (R, 32, 16, ssse3);
+ assign_itx2_fn (, 32, 32, ssse3);
+ assign_itx1_fn (R, 16, 64, ssse3);
+ assign_itx1_fn (R, 32, 64, ssse3);
+ assign_itx1_fn (R, 64, 16, ssse3);
+ assign_itx1_fn (R, 64, 32, ssse3);
+ assign_itx1_fn ( , 64, 64, ssse3);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+#if BITDEPTH == 16
+ if (bpc == 10) {
+ assign_itx16_fn(, 4, 4, sse4);
+ assign_itx16_fn(R, 4, 8, sse4);
+ assign_itx16_fn(R, 4, 16, sse4);
+ assign_itx16_fn(R, 8, 4, sse4);
+ assign_itx16_fn(, 8, 8, sse4);
+ assign_itx16_fn(R, 8, 16, sse4);
+ assign_itx16_fn(R, 16, 4, sse4);
+ assign_itx16_fn(R, 16, 8, sse4);
+ assign_itx12_fn(, 16, 16, sse4);
+ assign_itx2_fn (R, 8, 32, sse4);
+ assign_itx2_fn (R, 32, 8, sse4);
+ assign_itx2_fn (R, 16, 32, sse4);
+ assign_itx2_fn (R, 32, 16, sse4);
+ assign_itx2_fn (, 32, 32, sse4);
+ assign_itx1_fn (R, 16, 64, sse4);
+ assign_itx1_fn (R, 32, 64, sse4);
+ assign_itx1_fn (R, 64, 16, sse4);
+ assign_itx1_fn (R, 64, 32, sse4);
+ assign_itx1_fn (, 64, 64, sse4);
+ }
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx2);
+ assign_itx16_fn(R, 4, 8, avx2);
+ assign_itx16_fn(R, 4, 16, avx2);
+ assign_itx16_fn(R, 8, 4, avx2);
+ assign_itx16_fn( , 8, 8, avx2);
+ assign_itx16_fn(R, 8, 16, avx2);
+ assign_itx2_fn (R, 8, 32, avx2);
+ assign_itx16_fn(R, 16, 4, avx2);
+ assign_itx16_fn(R, 16, 8, avx2);
+ assign_itx12_fn( , 16, 16, avx2);
+ assign_itx2_fn (R, 16, 32, avx2);
+ assign_itx1_fn (R, 16, 64, avx2);
+ assign_itx2_fn (R, 32, 8, avx2);
+ assign_itx2_fn (R, 32, 16, avx2);
+ assign_itx2_fn ( , 32, 32, avx2);
+ assign_itx1_fn (R, 32, 64, avx2);
+ assign_itx1_fn (R, 64, 16, avx2);
+ assign_itx1_fn (R, 64, 32, avx2);
+ assign_itx1_fn ( , 64, 64, avx2);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 4, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 10, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
+ assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
+ } else {
+ assign_itx16_bpc_fn( , 4, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
+ }
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx512icl); // no wht
+ assign_itx16_fn(R, 4, 8, avx512icl);
+ assign_itx16_fn(R, 4, 16, avx512icl);
+ assign_itx16_fn(R, 8, 4, avx512icl);
+ assign_itx16_fn( , 8, 8, avx512icl);
+ assign_itx16_fn(R, 8, 16, avx512icl);
+ assign_itx2_fn (R, 8, 32, avx512icl);
+ assign_itx16_fn(R, 16, 4, avx512icl);
+ assign_itx16_fn(R, 16, 8, avx512icl);
+ assign_itx12_fn( , 16, 16, avx512icl);
+ assign_itx2_fn (R, 16, 32, avx512icl);
+ assign_itx1_fn (R, 16, 64, avx512icl);
+ assign_itx2_fn (R, 32, 8, avx512icl);
+ assign_itx2_fn (R, 32, 16, avx512icl);
+ assign_itx2_fn ( , 32, 32, avx512icl);
+ assign_itx1_fn (R, 32, 64, avx512icl);
+ assign_itx1_fn (R, 64, 16, avx512icl);
+ assign_itx1_fn (R, 64, 32, avx512icl);
+ assign_itx1_fn ( , 64, 64, avx512icl);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 8, 8, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl);
+ }
+#endif
+#endif
+}
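
The new header above is declaration-only glue: the decl_* macros paste together the exported symbol name for every transform/size/ISA combination, and itx_dsp_init_x86() fills the dispatch table at runtime, gated on CPU flags and (for the high-bitdepth build) on bpc. As a rough illustration of what one assignment expands to, note that BF() and the TX_*/RTX_* enum values come from dav1d headers not shown in this patch, so the exact 8bpc suffix below is an assumption; in an 8-bit build, assign_itx_fn(R, 4, 8, dct_dct, DCT_DCT, avx2) boils down to a single function-pointer store:

    /* hypothetical expansion, for illustration only */
    c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_avx2;

The BF_BPC() variants differ only in carrying an explicit 10/12 tag in the symbol name (e.g. dav1d_inv_txfm_add_dct_dct_4x8_10bpc_avx2), which is why the 16-bit init path additionally branches on bpc before assigning.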
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm
index c580944c7bb..811f711540f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm
@@ -30,7 +30,6 @@
%if ARCH_X86_64
SECTION_RODATA 32
-pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482
itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7
@@ -39,14 +38,17 @@ iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6
iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
-%macro COEF_PAIR 2
+%macro COEF_PAIR 2-3 0
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3
+dd -%2, -%2
+%define pd_%2_m%2 pd_%2
+%endif
%endmacro
COEF_PAIR 201, 995
@@ -56,8 +58,8 @@ COEF_PAIR 1380, 601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2751, 2106
-COEF_PAIR 2896, 1567
-COEF_PAIR 2896, 3784
+COEF_PAIR 2896, 1567, 1
+COEF_PAIR 2896, 3784, 1
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
@@ -66,9 +68,6 @@ COEF_PAIR 4017, 2276
COEF_PAIR 4076, 3612
COEF_PAIR 4091, 3973
-%define pd_1321 (pd_1321_2482 + 4*0)
-%define pd_2482 (pd_1321_2482 + 4*4)
-
pd_8: dd 8
pd_m601: dd -601
pd_m1189: dd -1189
@@ -77,17 +76,23 @@ pd_m2106: dd -2106
pd_m2598: dd -2598
pd_m2751: dd -2751
pd_m3344: dd -3344
+pd_1024: dd 1024
+pd_1321: dd 1321
+pd_1448: dd 1448
+pd_1697: dd 1697
+pd_2482: dd 2482
+pd_3072: dd 3072 ; 1024 + 2048
pd_3803: dd 3803
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
pd_5793: dd 5793
pd_6144: dd 6144 ; 2048 + 4096
-pd_10239: dd 10239 ; 2048 + 8192 - 1
-pd_10240: dd 10240 ; 2048 + 8192
-pd_11586: dd 11586 ; 5793 * 2
-pd_34816: dd 34816 ; 2048 + 32768
-pd_38912: dd 38912 ; 2048 + 4096 + 32768
+pd_17408: dd 17408 ; 1024 + 16384
pixel_10bpc_max: times 2 dw 0x03ff
pixel_12bpc_max: times 2 dw 0x0fff
+dconly_10bpc: times 2 dw 0x7c00
+dconly_12bpc: times 2 dw 0x7000
clip_18b_min: dd -0x20000
clip_18b_max: dd 0x1ffff
clip_20b_min: dd -0x80000
@@ -214,7 +219,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
-; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2
+; flags: 1 = packed, 2 = inv_dst2
; skip round/shift if rnd is not a number
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
%if %8 < 32
@@ -241,7 +246,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
pmulld m%1, m%5
pmulld m%2, m%5
%endif
-%if %9 & 4
+%if %9 & 2
psubd m%4, m%6, m%4
psubd m%2, m%4, m%2
%else
@@ -250,17 +255,10 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
%endif
paddd m%2, m%4
%endif
-%if %9 & 2 ; invert the upper half of dst1 before rounding
- vbroadcasti128 m%4, [pw_2048_m2048]
- psubd m%1, m%3
- psignd m%1, m%4
- paddd m%1, m%6
-%else
%ifnum %6
paddd m%1, m%6
%endif
psubd m%1, m%3
-%endif
%ifnum %6
psrad m%2, 12
psrad m%1, 12
@@ -287,37 +285,39 @@ ALIGN function_align
%endif
%endmacro
-%macro INV_TXFM_4X4_FN 2 ; type1, type2
- INV_TXFM_FN %1, %2, 0, 4x4
-%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
- movd xm1, [pw_2896x8]
- mov [cq], eobd ; 0
- add r6d, 2048
- sar r6d, 12
- movd xm0, r6d
- packssdw xm0, xm0
- pmulhrsw xm0, xm1
- vpbroadcastw xm0, xm0
- mova xm1, xm0
- jmp m(iadst_4x4_internal_10bpc).end
-%endif
-%endmacro
-
-%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2
- INV_TXFM_FN %1, %2, 0, 4x4, 12
+%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x4, %3
%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
add r6d, 128
sar r6d, 8
+.dconly3:
imul r6d, 181
- add r6d, 128
- sar r6d, 8
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
- vpbroadcastd m0, xm0
- mova m1, m0
- jmp m(iadst_4x4_internal_12bpc).end
+ paddsw xm0, xm2
+ vpbroadcastw xm0, xm0
+.dconly_loop:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ paddsw xm1, xm0
+ psubusw xm1, xm2
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ WRAP_XMM RET
+%else
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
+%endif
%endif
%endmacro
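
For reference, the reworked .dconly path above replaces the 2896-based multiplies with 181 (= 2896 >> 4) and folds the final clamp into a paddsw/psubusw pair against the dconly_* bias. A rough scalar equivalent for the 10-bit 4x4 case is sketched below; the constants are taken from the asm, while the function name and signature are invented for illustration:

    #include <stddef.h>
    #include <stdint.h>

    /* scalar sketch of the 10bpc 4x4 DC-only add (not the actual dav1d code) */
    static void dconly_add_4x4_10bpc(uint16_t *dst, ptrdiff_t stride, int dc_coef)
    {
        int dc = (dc_coef * 181 + 128) >> 8;   /* first-pass DC, ~1/sqrt(2) */
        dc = (dc * 181 + 2176) >> 12;          /* second pass, rounding and final shift folded in */
        for (int y = 0; y < 4; y++, dst += stride)
            for (int x = 0; x < 4; x++) {
                int t = dst[x] + dc;
                /* paddsw with 0x7c00 then psubusw by 0x7c00 clamps to [0, 1023] */
                dst[x] = t < 0 ? 0 : t > 1023 ? 1023 : (uint16_t)t;
            }
    }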
@@ -399,12 +399,50 @@ INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity
+%macro IADST4_1D 0
+ vpbroadcastd m5, [pd_1321]
+ vpbroadcastd m7, [pd_2482]
+ pmulld m4, m0, m5 ; 1321*in0
+ pmulld m6, m3, m7 ; 2482*in3
+ paddd m4, m6 ; 1321*in0 + 2482*in3
+ pmulld m6, m0, m7 ; 2482*in0
+ paddd m0, m3 ; in0 + in3
+ paddd m7, m5 ; pd_3803
+ pmulld m5, m2 ; 1321*in2
+ pmulld m3, m7 ; 3803*in3
+ pmulld m7, m2 ; 3803*in2
+ psubd m2, m0 ; in2 - in0 - in3
+ vpbroadcastd m0, [pd_m3344]
+ pmulld m1, m0 ; -t3
+ pmulld m2, m0 ; out2 (unrounded)
+ psubd m6, m5 ; 2482*in0 - 1321*in2
+ paddd m4, m7 ; t0
+ psubd m6, m3 ; t1
+ paddd m3, m4, m6
+ psubd m4, m1 ; out0 (unrounded)
+ psubd m6, m1 ; out1 (unrounded)
+ paddd m3, m1 ; out3 (unrounded)
+%endmacro
+
cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call .main
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_2048]
+ mova m2, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
packssdw m0, m1
- vpermd m0, m4, m0
- psrld m4, 4
- pshufb m0, m4
+ vpermd m0, m2, m0
+ psrld m2, 4
+ pshufb m0, m2
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
jmp tx2q
.pass2:
lea r6, [deint_shuf+128]
@@ -436,35 +474,16 @@ cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
RET
ALIGN function_align
.main:
- mova m2, [cq+16*2]
- vbroadcasti128 m5, [cq+16*0]
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+%if WIN64
+ movaps [rsp+16], xmm6
+ movaps [rsp+32], xmm7
+%endif
.main2:
- mova m0, [pd_1321_2482]
- vpbroadcastd m3, [pd_3803]
- vpbroadcastd m1, [pd_m3344]
- pmulld m4, m0, m2
- pmulld m3, m2
- pmulld m0, m5
- vpbroadcastd m5, [pd_2048]
- psubd xm2, [cq+16*3]
- psubd m2, [cq+16*0]
- pmulld m2, m1 ; t2 t3
- vpermq m4, m4, q1032
- paddd m4, m3
- psubd m0, m4
- paddd xm4, xm4
- paddd m4, m0 ; t0 t1
- vinserti128 m3, m2, xm4, 1 ; t2 t0
- paddd m0, m4, m5
- psubd xm4, xm2
- psubd m1, m0, m2
- vpermq m2, m2, q3232 ; t3 t3
- psubd m1, m4
- mova m4, [itx4_shuf]
- paddd m0, m2 ; out0 out1
- paddd m1, m3 ; out2 out3
- psrad m0, 12
- psrad m1, 12
+ WRAP_XMM IADST4_1D
ret
INV_TXFM_4X4_FN flipadst, dct
@@ -474,12 +493,9 @@ INV_TXFM_4X4_FN flipadst, identity
cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
- packssdw m0, m1
- psrld m1, m4, 8
- vpermd m0, m1, m0
- psrld m4, 4
- pshufb m0, m4
- jmp tx2q
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_10bpc).pass1_end
.pass2:
lea r6, [deint_shuf+128]
vextracti128 xm1, m0, 1
@@ -556,19 +572,20 @@ cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
movhps [r6 +strideq*1], xm1
RET
-INV_TXFM_4X4_12BPC_FN dct, dct
-INV_TXFM_4X4_12BPC_FN dct, identity
-INV_TXFM_4X4_12BPC_FN dct, adst
-INV_TXFM_4X4_12BPC_FN dct, flipadst
+INV_TXFM_4X4_FN dct, dct, 12
+INV_TXFM_4X4_FN dct, identity, 12
+INV_TXFM_4X4_FN dct, adst, 12
+INV_TXFM_4X4_FN dct, flipadst, 12
-cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
+cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(idct_4x4_internal_10bpc).main
mova m3, [idct4_12_shuf]
mova m4, [idct4_12_shuf2]
- vpermd m2, m3, m0
- vpermd m1, m4, m1
- jmp m(iadst_4x4_internal_12bpc).pass1_end
+ vpermd m2, m4, m1
+ vpermd m1, m3, m0
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
+ vpbroadcastd m5, [pd_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q3120
call m(idct_4x4_internal_10bpc).main2
@@ -576,33 +593,52 @@ cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpermq m1, m1, q2031
jmp m(iadst_4x4_internal_12bpc).end
-INV_TXFM_4X4_12BPC_FN adst, dct
-INV_TXFM_4X4_12BPC_FN adst, adst
-INV_TXFM_4X4_12BPC_FN adst, flipadst
-INV_TXFM_4X4_12BPC_FN adst, identity
+INV_TXFM_4X4_FN adst, dct, 12
+INV_TXFM_4X4_FN adst, adst, 12
+INV_TXFM_4X4_FN adst, flipadst, 12
+INV_TXFM_4X4_FN adst, identity, 12
-cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
+cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
- vpermd m2, m4, m0
- vpermd m1, m4, m1
+ vinserti128 m1, m4, xm6, 1
+ vinserti128 m2, xm3, 1
.pass1_end:
- punpcklqdq m0, m2, m1
- punpckhqdq m1, m2, m1
+ mova m3, [itx4_shuf]
+ vpbroadcastd m5, [pd_1024]
+ psrad m1, 1
+ psrad m2, 1
+ vpermd m1, m3, m1
+ vpermd m2, m3, m2
+ paddd m1, m5
+ paddd m2, m5
+ psrad m1, 11
+ psrad m2, 11
.pass1_end2:
vpbroadcastd m3, [clip_18b_min]
vpbroadcastd m4, [clip_18b_max]
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
pmaxsd m0, m3
pmaxsd m1, m3
pminsd m0, m4
pminsd m1, m4
jmp tx2q
.pass2:
- mova [cq+16*0], m0
- vextracti128 [cq+16*3], m1, 1
- mova m2, m1
- vpermq m5, m0, q1010
- call m(iadst_4x4_internal_10bpc).main2
+ call .main_pass2
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass2_end:
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
.end:
+%if WIN64
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign xmm_regs_used 6
+%endif
+.end2:
vpbroadcastd m4, [pw_16384]
movq xm2, [dstq+strideq*0]
movq xm3, [dstq+strideq*1]
@@ -627,53 +663,53 @@ cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
movhps [r6 +strideq*0], xm0
movhps [r6 +strideq*1], xm1
RET
+.main_pass2:
+ vextracti128 xm3, m1, 1
+ mova xm2, xm1
+ vextracti128 xm1, m0, 1
+ jmp m(iadst_4x4_internal_10bpc).main2
-INV_TXFM_4X4_12BPC_FN flipadst, dct
-INV_TXFM_4X4_12BPC_FN flipadst, adst
-INV_TXFM_4X4_12BPC_FN flipadst, flipadst
-INV_TXFM_4X4_12BPC_FN flipadst, identity
+INV_TXFM_4X4_FN flipadst, dct, 12
+INV_TXFM_4X4_FN flipadst, adst, 12
+INV_TXFM_4X4_FN flipadst, flipadst, 12
+INV_TXFM_4X4_FN flipadst, identity, 12
-cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
+cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
- psrld m4, 8
- vpermd m2, m4, m0
- vpermd m1, m4, m1
- punpckhqdq m0, m1, m2
- punpcklqdq m1, m2
- jmp m(iadst_4x4_internal_12bpc).pass1_end2
+ vinserti128 m1, m3, xm2, 1
+ vinserti128 m2, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass1_end
.pass2:
- mova [cq+16*0], m0
- vextracti128 [cq+16*3], m1, 1
- mova m2, m1
- vpermq m5, m0, q1010
- call m(iadst_4x4_internal_10bpc).main2
- vpermq m2, m0, q1032
- vpermq m0, m1, q1032
- mova m1, m2
- jmp m(iadst_4x4_internal_12bpc).end
-
-INV_TXFM_4X4_12BPC_FN identity, dct
-INV_TXFM_4X4_12BPC_FN identity, adst
-INV_TXFM_4X4_12BPC_FN identity, flipadst
-INV_TXFM_4X4_12BPC_FN identity, identity
-
-cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
- vpbroadcastd m1, [pd_5793]
- pmulld m0, m1, [cq+32*0]
- pmulld m1, [cq+32*1]
+ call m(iadst_4x4_internal_12bpc).main_pass2
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass2_end
+
+INV_TXFM_4X4_FN identity, dct, 12
+INV_TXFM_4X4_FN identity, adst, 12
+INV_TXFM_4X4_FN identity, flipadst, 12
+INV_TXFM_4X4_FN identity, identity, 12
+
+cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ mova m2, [itx4_shuf]
+ vpbroadcastd m3, [pd_1697]
+ vpermd m0, m2, [cq+32*0]
+ vpermd m2, m2, [cq+32*1]
vpbroadcastd m5, [pd_2048]
- mova m3, [itx4_shuf]
- paddd m0, m5
+ pmulld m1, m3, m0
+ pmulld m3, m2
paddd m1, m5
- psrad m0, 12
+ paddd m3, m5
psrad m1, 12
- vpermd m2, m3, m0
- vpermd m1, m3, m1
- jmp m(iadst_4x4_internal_12bpc).pass1_end
+ psrad m3, 12
+ paddd m1, m0
+ paddd m2, m3
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
; m0 = in0 in1
; m1 = in2 in3
vpbroadcastd m3, [pd_5793]
+ vpbroadcastd m5, [pd_2048]
pmulld m0, m3
pmulld m1, m3
paddd m0, m5 ; 2048
@@ -685,34 +721,19 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x8, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 8
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 2048
- sar r6d, 12
-.end:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
- movd xm0, r6d
- vpbroadcastw xm0, xm0
- vpbroadcastd xm3, [pixel_%3bpc_max]
- pxor xm2, xm2
-.end_loop:
- movq xm1, [dstq+strideq*0]
- movhps xm1, [dstq+strideq*1]
- paddw xm1, xm0
- pmaxsw xm1, xm2
- pminsw xm1, xm3
- movq [dstq+strideq*0], xm1
- movhps [dstq+strideq*1], xm1
- lea dstq, [dstq+strideq*2]
- sub r3d, 2
- jg .end_loop
- WRAP_XMM RET
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
+%else
+ jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
+%endif
%endif
%endmacro
@@ -797,12 +818,14 @@ INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
-.pass1:
call m(iadst_8x4_internal_10bpc).main
- psrad m0, m4, 12
- psrad m1, m5, 12
- psrad m2, 12
- psrad m3, 12
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass1_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
jmp tx2q
.pass2:
call .pass2_main
@@ -918,13 +941,13 @@ INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
-.pass1:
call m(iadst_8x4_internal_10bpc).main
- psrad m0, m3, 12
- psrad m1, m2, 12
- psrad m2, m5, 12
- psrad m3, m4, 12
- jmp tx2q
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m2, m5, m6
+ paddd m3, m5, m4
+ jmp m(iadst_4x8_internal_10bpc).pass1_end
.pass2:
call m(iadst_4x8_internal_10bpc).pass2_main
mova xm4, [pw_2048_m2048]
@@ -1070,7 +1093,16 @@ INV_TXFM_4X8_FN adst, flipadst, 12
INV_TXFM_4X8_FN adst, identity, 12
cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
- jmp m(iadst_4x8_internal_10bpc).pass1
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m4, 1
+ psrad m1, m6, 1
+ psrad m2, 1
+ psrad m3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_1024]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 11}, m0, m1, m2, m3
+ jmp tx2q
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
@@ -1146,7 +1178,12 @@ INV_TXFM_4X8_FN flipadst, flipadst, 12
INV_TXFM_4X8_FN flipadst, identity, 12
cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
- jmp m(iflipadst_4x8_internal_10bpc).pass1
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m3, 1
+ psrad m1, m2, 1
+ psrad m2, m6, 1
+ psrad m3, m4, 1
+ jmp m(iadst_4x8_internal_12bpc).pass1_end
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
@@ -1180,12 +1217,13 @@ cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x16, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd xm2, [dconly_%3bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 6144
- sar r6d, 13
- jmp m(inv_txfm_add_dct_dct_4x8_%3bpc).end
+ or r3d, 16
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
%endif
%endmacro
@@ -1196,7 +1234,7 @@ INV_TXFM_4X16_FN dct, flipadst
cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
.pass1:
- vpbroadcastd m10, [pd_6144]
+ vpbroadcastd m10, [pd_3072]
mova m1, [cq+32*2]
mova m3, [cq+32*6]
mova m5, [cq+32*3]
@@ -1241,7 +1279,7 @@ ALIGN function_align
vpbroadcastd m4, [pd_3784]
vpbroadcastd m8, [pd_1567]
vpbroadcastd m9, [pd_2048]
- vpbroadcastd m6, [pd_2896]
+ vpbroadcastd m6, [pd_1448]
ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
ret
@@ -1253,7 +1291,7 @@ ALIGN function_align
psubd m0, m2
paddd m9, m4, m6
psubd m4, m6
- REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
+ REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
psubd m2, m0, m1
paddd m1, m0
psubd m6, m4, m5
@@ -1304,7 +1342,6 @@ INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
-.pass1:
call m(iadst_16x4_internal_10bpc).main
vpbroadcastd m6, [pd_6144]
call m(iadst_16x4_internal_10bpc).main_end
@@ -1545,7 +1582,6 @@ INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
-.pass1:
vpbroadcastd m7, [pd_5793]
pmulld m0, m7, [cq+32*0]
pmulld m4, m7, [cq+32*1]
@@ -1678,7 +1714,16 @@ INV_TXFM_4X16_FN adst, flipadst, 12
INV_TXFM_4X16_FN adst, identity, 12
cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iadst_4x16_internal_10bpc).pass1
+ call .main_pass1
+ psrad m0, m4, 12
+ psrad m1, m5, 12
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ psrad m6, 12
+ psrad m7, 12
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -1740,6 +1785,22 @@ ALIGN function_align
vperm2i128 m4, m8, m9, 0x20 ; 8 10
vperm2i128 m6, m8, m9, 0x31 ; 12 14
ret
+ALIGN function_align
+.main_pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_3072]
+ paddd m10, m4, m5
+ psubd m4, m3
+ psubd m5, m3
+ paddd m3, m10
+ psubd m8, m7, m1
+ paddd m7, m9
+ psubd m9, m1
+ paddd m7, m1
+ REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
+ REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
+ paddd m6, m0
+ ret
INV_TXFM_4X16_FN flipadst, dct, 12
INV_TXFM_4X16_FN flipadst, adst, 12
@@ -1747,7 +1808,16 @@ INV_TXFM_4X16_FN flipadst, flipadst, 12
INV_TXFM_4X16_FN flipadst, identity, 12
cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iflipadst_4x16_internal_10bpc).pass1
+ call m(iadst_4x16_internal_12bpc).main_pass1
+ psrad m0, m3, 12
+ psrad m1, m2, 12
+ psrad m2, m5, 12
+ psrad m3, m4, 12
+ psrad m4, m7, 12
+ psrad m5, m6, 12
+ psrad m6, m9, 12
+ psrad m7, m8, 12
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -1772,17 +1842,49 @@ INV_TXFM_4X16_FN identity, flipadst, 12
INV_TXFM_4X16_FN identity, identity, 12
cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iidentity_4x16_internal_10bpc).pass1
+ vpbroadcastd m8, [pd_1697]
+ mova m0, [cq+32*0]
+ mova m4, [cq+32*1]
+ mova m1, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m9, [pd_6144]
+ pmulld m2, m8, m0
+ pmulld m6, m8, m4
+ pmulld m3, m8, m1
+ pmulld m7, m8, m5
+ mova m10, [cq+32*4]
+ mova m11, [cq+32*5]
+ mova m12, [cq+32*6]
+ mova m13, [cq+32*7]
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m0, m2
+ pmulld m2, m8, m10
+ paddd m4, m6
+ pmulld m6, m8, m11
+ paddd m1, m3
+ pmulld m3, m8, m12
+ paddd m5, m7
+ pmulld m7, m8, m13
+ REPX {psrad x, 1 }, m0, m4, m1, m5
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m2, m10
+ paddd m6, m11
+ paddd m3, m12
+ paddd m7, m13
+ REPX {psrad x, 1 }, m2, m6, m3, m7
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
- vpbroadcastd m8, [pd_11586]
- vpbroadcastd m9, [pd_2048]
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_1024]
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
- REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
@@ -1795,37 +1897,21 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 8x4, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
- movd xm0, r6d
- vpbroadcastw m0, xm0
-.end:
- vpbroadcastd m4, [pixel_%3bpc_max]
- pxor m3, m3
- mova xm1, [dstq+strideq*0]
- vinserti128 m1, [dstq+strideq*1], 1
- lea r6, [dstq+strideq*2]
- mova xm2, [r6 +strideq*0]
- vinserti128 m2, [r6 +strideq*1], 1
- paddw m1, m0
- paddw m2, m0
- pmaxsw m1, m3
- pmaxsw m2, m3
- pminsw m1, m4
- pminsw m2, m4
- mova [dstq+strideq*0], xm1
- vextracti128 [dstq+strideq*1], m1, 1
- mova [r6 +strideq*0], xm2
- vextracti128 [r6 +strideq*1], m2, 1
- RET
+ or r3d, 4
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+%else
+ jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
+%endif
%endif
%endmacro
@@ -1960,32 +2046,7 @@ ALIGN function_align
REPX {paddd x, m4}, m0, m3, m2, m1
REPX {psrad x, 12}, m0, m3, m2, m1
.main2:
- vbroadcasti128 m6, [pd_1321]
- vbroadcasti128 m7, [pd_2482]
- pmulld m4, m0, m6 ; 1321*in0
- pmulld m5, m3, m7 ; 2482*in3
- paddd m4, m5 ; 1321*in0 + 2482*in3
- pmulld m5, m0, m7 ; 2482*in0
- paddd m0, m3 ; in0 + in3
- paddd m7, m6 ; pd_3803
- pmulld m6, m2 ; 1321*in2
- pmulld m3, m7 ; 3803*in3
- pmulld m7, m2 ; 3803*in2
- psubd m2, m0 ; in2 - in0 - in3
- vpbroadcastd m0, [pd_m3344]
- psubd m5, m6 ; 2482*in0 - 1321*in2
- vpbroadcastd m6, [pd_2048]
- psubd m5, m3 ; t1
- pmulld m2, m0 ; t2
- pmulld m1, m0 ; -t3
- paddd m4, m7 ; t0
- paddd m5, m6
- paddd m3, m4, m5
- paddd m4, m6
- psubd m4, m1 ; out0 (unshifted)
- psubd m5, m1 ; out1 (unshifted)
- paddd m2, m6 ; out2 (unshifted)
- paddd m3, m1 ; out3 (unshifted)
+ IADST4_1D
ret
INV_TXFM_8X4_FN flipadst, dct
@@ -2103,10 +2164,13 @@ cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call .pass2_main
- psrad m0, m4, 12
- psrad m1, m5, 12
- psrad m2, 12
- psrad m3, 12
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass2_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
.end:
vpbroadcastd m4, [pw_16384]
REPX {psrad x, 3}, m0, m1, m2, m3
@@ -2162,11 +2226,12 @@ cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call m(iadst_8x4_internal_12bpc).pass2_main
- psrad m0, m3, 12
- psrad m3, m4, 12
- psrad m1, m2, 12
- psrad m2, m5, 12
- jmp m(iadst_8x4_internal_12bpc).end
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m3, m5, m4
+ paddd m2, m5, m6
+ jmp m(iadst_8x4_internal_12bpc).pass2_end
INV_TXFM_8X4_FN identity, dct, 12
INV_TXFM_8X4_FN identity, adst, 12
@@ -2197,32 +2262,36 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 8x8, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
- mov [cq], eobd ; 0
- mov r3d, 8
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
.dconly:
- add r6d, 6144
- sar r6d, 13
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
+ paddsw xm0, xm2
vpbroadcastw m0, xm0
- vpbroadcastd m3, [pixel_%3bpc_max]
- pxor m2, m2
.dconly_loop:
mova xm1, [dstq+strideq*0]
vinserti128 m1, [dstq+strideq*1], 1
- paddw m1, m0
- pmaxsw m1, m2
- pminsw m1, m3
+ paddsw m1, m0
+ psubusw m1, m2
mova [dstq+strideq*0], xm1
vextracti128 [dstq+strideq*1], m1, 1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
+%else
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
%endif
%endmacro
@@ -2245,7 +2314,7 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
psubd m%10, m%7, m%9 ; t7
paddd m%7, m%9 ; out6
- vpbroadcastd m%9, [pd_2896]
+ vpbroadcastd m%9, [pd_1448]
psubd m%4, m%8, m%6 ; t3
paddd m%8, m%6 ; -out7
psubd m%6, m%1, m%3 ; t2
@@ -2255,10 +2324,10 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
- psubd m%5, m%6, m%4 ; (t2 - t3) * 2896
- paddd m%4, m%6 ; (t2 + t3) * 2896
- psubd m%6, m%3, m%10 ; (t6 - t7) * 2896
- paddd m%3, m%10 ; (t6 + t7) * 2896
+ psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
+ paddd m%4, m%6 ; (t2 + t3) * 1448
+ psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
+ paddd m%3, m%10 ; (t6 + t7) * 1448
%endmacro
INV_TXFM_8X8_FN dct, dct
@@ -2430,8 +2499,8 @@ ALIGN function_align
vpbroadcastd m11, [pd_2048]
.main2:
IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
- psrld m8, 11 ; pd_1
- vpbroadcastd m9, [pd_6144]
+ psrld m8, 10 ; pd_1
+ vpbroadcastd m9, [pd_3072]
ret
ALIGN function_align
.main_end:
@@ -2440,14 +2509,14 @@ ALIGN function_align
paddd m6, m8
psubd m7, m8, m7
REPX {psrad x, 1 }, m0, m1, m6, m7
- ; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13
- ; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13
- psubd m8, m9, m8 ; pd_6143
+ ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
+ ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
+ psubd m8, m9, m8 ; pd_3071
paddd m2, m9
psubd m3, m8, m3
paddd m4, m9
psubd m5, m8, m5
- REPX {psrad x, 13}, m2, m3, m4, m5
+ REPX {psrad x, 12}, m2, m3, m4, m5
ret
INV_TXFM_8X8_FN flipadst, dct
@@ -2496,10 +2565,10 @@ ALIGN function_align
paddd m5, m9, m2
psubd m2, m8, m3
paddd m3, m9, m4
- psrad m4, m2, 13
- psrad m2, m10, 13
- psrad m3, 13
- psrad m5, 13
+ psrad m4, m2, 12
+ psrad m2, m10, 12
+ psrad m3, 12
+ psrad m5, 12
ret
INV_TXFM_8X8_FN identity, dct
@@ -2681,13 +2750,13 @@ ALIGN function_align
paddd m6, m9
psubd m7, m9, m7
REPX {psrad x, 4}, m0, m1, m6, m7
- vpbroadcastd m9, [pd_34816]
- psubd m8, m9, m8 ; 34815
+ vpbroadcastd m9, [pd_17408]
+ psubd m8, m9, m8 ; 17407
paddd m2, m9
psubd m3, m8, m3
paddd m4, m9
psubd m5, m8, m5
- REPX {psrad x, 16}, m2, m3, m4, m5
+ REPX {psrad x, 15}, m2, m3, m4, m5
ret
INV_TXFM_8X8_FN flipadst, dct, 12
@@ -2729,13 +2798,14 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
INV_TXFM_FN %1, %2, %3, 8x16, %4
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_%4bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
%endif
%endmacro
@@ -2904,7 +2974,7 @@ ALIGN function_align
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
- ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
psubd m3, m1, m4 ; t10
paddd m1, m4 ; t9
psubd m4, m0, m2 ; t11a
@@ -3269,7 +3339,7 @@ cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_end:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
jmp m(idct_8x16_internal_12bpc).end
ALIGN function_align
.pass2_main:
@@ -3302,9 +3372,9 @@ ALIGN function_align
pmaxsd m7, m13, [cq+32* 3] ; 3
REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
call m(iadst_16x8_internal_10bpc).main_part2
- vpbroadcastd m14, [pd_34816]
+ vpbroadcastd m14, [pd_17408]
psrld m15, 11 ; pd_1
- psubd m13, m14, m15 ; pd_34815
+ psubd m13, m14, m15 ; pd_17407
pslld m15, 3 ; pd_8
ret
@@ -3357,49 +3427,52 @@ ALIGN function_align
m8, m9, m10, m11, m12, m13, m14
pminsd m15, [cq]
mova [cq], m7
- vpbroadcastd m7, [pd_11586]
+ vpbroadcastd m7, [pd_5793]
REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulld m7, [cq]
mova [cq], m15
- vpbroadcastd m15, [pd_2048]
+ vpbroadcastd m15, [pd_1024]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
paddd m15, [cq]
- REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
ret
%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 16x4, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
- mov [cq], eobd ; 0
- mov r3d, 4
+ vpbroadcastd m3, [dconly_%3bpc]
+%if %3 = 10
.dconly:
- add r6d, 6144
- sar r6d, 13
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
+ paddsw xm0, xm3
vpbroadcastw m0, xm0
- vpbroadcastd m4, [pixel_%3bpc_max]
- pxor m3, m3
.dconly_loop:
- paddw m1, m0, [dstq+strideq*0]
- paddw m2, m0, [dstq+strideq*1]
- pmaxsw m1, m3
- pmaxsw m2, m3
- pminsw m1, m4
- pminsw m2, m4
+ paddsw m1, m0, [dstq+strideq*0]
+ paddsw m2, m0, [dstq+strideq*1]
+ psubusw m1, m3
+ psubusw m2, m3
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
+%else
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+%endif
%endif
%endmacro
@@ -3480,13 +3553,30 @@ ALIGN function_align
.pass1_main2:
ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
- psubd m4, m10, m5 ; t9 -t10
+ vbroadcasti128 m12, [pd_3784_m3784]
+ psubd m4, m10, m5
paddd m10, m5 ; t8 t11
- psubd m5, m11, m6 ; t14 -t13
+ psignd m4, m12 ; t9 t10
+ psubd m5, m11, m6
paddd m11, m6 ; t15 t12
- REPX {pmaxsd x, m8}, m4, m5, m10, m11
- REPX {pminsd x, m9}, m4, m5, m10, m11
- ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2
+ psignd m5, m12 ; t14 t13
+ vpbroadcastd m6, [pd_1567]
+ vpbroadcastd m13, [pd_3784]
+ REPX {pmaxsd x, m8}, m5, m4
+ REPX {pminsd x, m9}, m5, m4
+ pmulld m12, m5
+ pmulld m5, m6
+ vbroadcasti128 m6, [pd_1567_m1567]
+ pmulld m13, m4
+ pmulld m4, m6
+ REPX {pmaxsd x, m8}, m10, m11, m0, m1
+ REPX {pminsd x, m9}, m10, m11, m0, m1
+ paddd m12, m7
+ paddd m5, m7
+ paddd m4, m12
+ psubd m5, m13
+ psrad m4, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
vpbroadcastd m12, [pd_2896]
punpckhqdq m6, m11, m5
punpcklqdq m11, m4
@@ -3500,8 +3590,8 @@ ALIGN function_align
REPX {pminsd x, m9}, m5, m6
pmulld m5, m12
pmulld m6, m12
- REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10
- REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10
+ REPX {pmaxsd x, m8}, m2, m3, m11, m10
+ REPX {pminsd x, m9}, m2, m3, m11, m10
ret
ALIGN function_align
.pass1_main3:
@@ -3565,10 +3655,10 @@ cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
jmp m(idct_16x4_internal_10bpc).end
ALIGN function_align
.main:
- vbroadcasti128 m6, [pd_1321]
+ vpbroadcastd m6, [pd_1321]
mova m0, [cq+32*0]
mova m1, [cq+32*1]
- vbroadcasti128 m7, [pd_2482]
+ vpbroadcastd m7, [pd_2482]
mova m2, [cq+32*6]
mova m3, [cq+32*7]
pmulld m4, m0, m6
@@ -3663,8 +3753,7 @@ INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
-.pass1:
- vpbroadcastd m8, [pd_11586]
+ vpbroadcastd m8, [pd_5793]
vpermq m0, [cq+32*0], q3120 ; 0 1
vpermq m1, [cq+32*1], q3120 ; 2 3
vpermq m2, [cq+32*2], q3120 ; 4 5
@@ -3673,10 +3762,10 @@ cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpermq m5, [cq+32*5], q3120 ; a b
vpermq m6, [cq+32*6], q3120 ; c d
vpermq m7, [cq+32*7], q3120 ; e f
- vpbroadcastd m9, [pd_6144]
+ vpbroadcastd m9, [pd_3072]
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
- REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
call m(idct_16x4_internal_10bpc).transpose_4x16_packed
@@ -3729,17 +3818,15 @@ cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
pmulld m2, m6, m11
pmulld m4, m6, m12
pmulld m6, m13
- vpbroadcastd m10, [pd_2048]
+ vpbroadcastd m10, [pd_17408]
call m(idct_4x16_internal_10bpc).pass1_main2
- REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
- vpbroadcastd m4, [pw_16384]
vpbroadcastd m5, [pixel_12bpc_max]
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
- REPX {pmulhrsw x, m4}, m0, m1, m2, m3
jmp m(idct_16x4_internal_10bpc).end2
INV_TXFM_16X4_FN adst, dct, 12
@@ -3824,7 +3911,37 @@ INV_TXFM_16X4_FN identity, flipadst, 12
INV_TXFM_16X4_FN identity, identity, 12
cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iidentity_16x4_internal_10bpc).pass1
+ vpbroadcastd m8, [pd_1697]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpbroadcastd m9, [pd_3072]
+ pmulld m4, m8, m0
+ pmulld m5, m8, m1
+ pmulld m6, m8, m2
+ pmulld m7, m8, m3
+ vpermq m10, [cq+32*4], q3120 ; 8 9
+ vpermq m11, [cq+32*5], q3120 ; a b
+ vpermq m12, [cq+32*6], q3120 ; c d
+ vpermq m13, [cq+32*7], q3120 ; e f
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m0, m4
+ pmulld m4, m8, m10
+ paddd m1, m5
+ pmulld m5, m8, m11
+ paddd m2, m6
+ pmulld m6, m8, m12
+ paddd m3, m7
+ pmulld m7, m8, m13
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m4, m10
+ paddd m5, m11
+ paddd m6, m12
+ paddd m7, m13
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -3844,13 +3961,14 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 16x8, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%3bpc]
mov [cq], eobd ; 0
- mov r3d, 8
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- jmp m(inv_txfm_add_dct_dct_16x4_%3bpc).dconly
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
%endif
%endmacro
@@ -4013,13 +4131,13 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
.pass1:
lea r6, [rsp+32*4]
call .main
- vpbroadcastd m14, [pd_6144]
+ vpbroadcastd m14, [pd_3072]
psrld m15, 11 ; pd_1
- psubd m13, m14, m15 ; pd_6143
+ psubd m13, m14, m15 ; pd_3071
call .pass1_rotations
.pass1_end:
REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
jmp tx2q
.pass2:
call m(idct_16x8_internal_10bpc).transpose
@@ -4127,8 +4245,6 @@ ALIGN function_align
pmaxsd m10, m13
pminsd m9, m14
pminsd m10, m14
- pmulld m9, m15
- pmulld m10, m15
mova [r6-32*4], m1
mova m11, [r6-32*1] ; t7a
mova m1, [r6-32*2] ; t6a
@@ -4140,7 +4256,6 @@ ALIGN function_align
pmaxsd m2, m13
pminsd m8, m14
pminsd m2, m14
- pmulld m8, m15
mova [r6-32*1], m11
mova [r6-32*3], m2
mova m1, [r6+32*3] ; t15
@@ -4153,8 +4268,6 @@ ALIGN function_align
pmaxsd m11, m13
pminsd m7, m14
pminsd m11, m14
- pmulld m7, m15
- pmulld m11, m15
mova [r6-32*2], m12
pminsd m1, m14, [r6+32*0] ; t10a
pminsd m12, m14, [r6+32*1] ; t11a
@@ -4162,13 +4275,13 @@ ALIGN function_align
paddd m1, m4 ; -out1
psubd m4, m5, m12 ; t11
paddd m5, m12 ; out14
- pmulld m12, m15, [r6-32*3] ; t6
+ vpbroadcastd m12, [pd_1448]
pmaxsd m6, m13
pmaxsd m4, m13
pminsd m6, m14
pminsd m4, m14
- pmulld m6, m15
- pmulld m4, m15
+ REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
+ pmulld m12, [r6-32*3] ; t6
mova [r6-32*3], m5
paddd m5, m11, m7 ; -out5 (unshifted)
psubd m11, m7 ; out10 (unshifted)
@@ -4233,7 +4346,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
.pass1:
lea r6, [rsp+32*4]
call m(iadst_16x8_internal_10bpc).main
- vpbroadcastd m14, [pd_6144]
+ vpbroadcastd m14, [pd_3072]
psrld m15, 11
psubd m13, m14, m15
call .pass1_rotations
@@ -4313,16 +4426,16 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
mova [rsp], m15
- vpbroadcastd m15, [pd_11586]
+ vpbroadcastd m15, [pd_5793]
REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
pmulld m15, [rsp]
mova [rsp], m7
- vpbroadcastd m7, [pd_6144]
+ vpbroadcastd m7, [pd_3072]
REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [rsp]
- REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
@@ -4340,6 +4453,10 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_16x8_internal_10bpc).pass1
.pass2:
+ call .pass2_main
+ RET
+ALIGN function_align
+.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -4383,8 +4500,7 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpermq m1, m5, q3120
vpermq m2, m6, q3120
vpermq m3, m7, q3120
- call m(idct_16x8_internal_10bpc).write_16x4_zero
- RET
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
.write_16x4_start:
vpbroadcastd m9, [pixel_12bpc_max]
@@ -4403,7 +4519,8 @@ cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp m(iadst_16x8_internal_10bpc).pass1
.pass2:
call .pass2_main
- jmp m(idct_16x8_internal_12bpc).end
+ call m(idct_16x8_internal_12bpc).end
+ RET
ALIGN function_align
.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
@@ -4483,12 +4600,13 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
INV_TXFM_FN %1, %2, %3, 16x16, %4
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%4bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 10240
- sar r6d, 14
- jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
%endif
%endmacro
@@ -4756,17 +4874,17 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
add cq, 32
call .main
sub cq, 32
- vpbroadcastd m8, [pd_10240]
+ vpbroadcastd m8, [pd_5120]
paddd m4, m8
paddd m6, m8
paddd m9, m8
paddd m11, m8
- vpbroadcastd m8, [pd_10239]
+ vpbroadcastd m8, [pd_5119]
psubd m5, m8, m5
psubd m7, m8, m7
psubd m10, m8, m10
psubd m12, m8, m12
- REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12
+ REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
@@ -4797,8 +4915,8 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
.fast:
add r6, 32*8
call .main
- vpbroadcastd m14, [pd_10240]
- vpbroadcastd m13, [pd_10239]
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
psrld m15, 10 ; pd_2
paddd m0, m15
psubd m1, m15, m1
@@ -4818,7 +4936,7 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
psubd m15, [r6-32*4]
.pass1_end:
REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
sub r6, 32*8
jmp tx2q
.pass2:
@@ -4892,17 +5010,17 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
add cq, 32
call m(iadst_16x16_internal_10bpc).main
sub cq, 32
- vpbroadcastd m8, [pd_10240]
+ vpbroadcastd m8, [pd_5120]
paddd m11, m8
paddd m9, m8
paddd m6, m8
paddd m4, m8
- vpbroadcastd m8, [pd_10239]
+ vpbroadcastd m8, [pd_5119]
psubd m12, m8, m12
psubd m10, m8, m10
psubd m7, m8, m7
psubd m5, m8, m5
- REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4
+ REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
mova [r6+32*0], m12
mova [r6+32*1], m11
mova [r6+32*2], m10
@@ -4933,8 +5051,8 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
.fast:
add r6, 32*8
call m(iadst_16x16_internal_10bpc).main
- vpbroadcastd m14, [pd_10240]
- vpbroadcastd m13, [pd_10239]
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
psrld m15, 10 ; pd_2
psubd m8, m13, m7
paddd m7, m14, m9
@@ -4996,9 +5114,8 @@ INV_TXFM_16X16_FN identity, dct, -92
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
-.pass1:
- vpbroadcastd m15, [pd_11586]
- vpbroadcastd m7, [pd_10240]
+ vpbroadcastd m15, [pd_5793]
+ vpbroadcastd m7, [pd_5120]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
@@ -5010,7 +5127,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
pmulld m3, m15, [cq+r3+32*39]
add r6, 32*4
REPX {paddd x, m7}, m0, m1, m2, m3
- REPX {psrad x, 14}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
@@ -5038,7 +5155,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [cq]
- REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
@@ -5203,7 +5320,7 @@ cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_part3:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
.end:
packssdw m15, m14
packssdw m14, m13, m12
@@ -5320,15 +5437,15 @@ ALIGN function_align
REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast2:
call m(iadst_16x8_internal_10bpc).main_part2
- vpbroadcastd m14, [pd_34816]
+ vpbroadcastd m14, [pd_17408]
psrld m15, 11 ; pd_1
- psubd m13, m14, m15 ; pd_34815
+ psubd m13, m14, m15 ; pd_17407
pslld m15, 3 ; pd_8
ret
ALIGN function_align
.pass2_part2:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
@@ -5375,8 +5492,73 @@ cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
INV_TXFM_16X16_FN identity, dct, -92, 12
INV_TXFM_16X16_FN identity, identity, 0, 12
+%macro IDTX16_12BPC 1 ; src
+ pmulld m6, m7, m%1
+ paddd m6, m15
+ psrad m6, 12
+ paddd m6, m%1
+ psrad m%1, m6, 1
+%endmacro
+
cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
- jmp m(iidentity_16x16_internal_10bpc).pass1
+ vpbroadcastd m7, [pd_1697]
+ vpbroadcastd m15, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ mova m10, [cq+r3+32*33]
+ mova m11, [cq+r3+32*35]
+ mova m12, [cq+r3+32*37]
+ mova m13, [cq+r3+32*39]
+ add r6, 32*4
+ pmulld m0, m7, m10
+ pmulld m1, m7, m11
+ pmulld m2, m7, m12
+ pmulld m3, m7, m13
+ REPX {paddd x, m15}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ paddd m0, m10
+ paddd m1, m11
+ paddd m2, m12
+ paddd m3, m13
+ REPX {psrad x, 1 }, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 1]
+ mova m2, [cq+64* 2]
+ mova m3, [cq+64* 3]
+ mova m4, [cq+64* 4]
+ mova m5, [cq+64* 5]
+ mova m8, [cq+64* 6]
+ mova m9, [cq+64* 7]
+ REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
+ mova [cq+64*0], m8
+ mova [cq+64*1], m9
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64* 9]
+ mova m10, [cq+64*10]
+ mova m11, [cq+64*11]
+ mova m12, [cq+64*12]
+ mova m13, [cq+64*13]
+ mova m14, [cq+64*14]
+ REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
+ mova m6, [cq+64*15]
+ pmulld m7, m6
+ paddd m7, m15
+ psrad m7, 12
+ paddd m7, m6
+ mova m6, [cq+64*0]
+ psrad m15, m7, 1
+ mova m7, [cq+64*1]
+ jmp tx2q
.pass2:
call m(iidentity_8x16_internal_12bpc).pass2_main
call m(idct_16x16_internal_10bpc).transpose_fast
@@ -5429,7 +5611,7 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
call m(idct_16x16_internal_12bpc).write_16x16
RET
-%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift
+%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
mova m%4, [r6+32*(%1-4)]
mova m%2, [r5+32*(3-%1)]
mova m%5, [r4+32*(%1-4)]
@@ -5446,8 +5628,10 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
paddd m%2, m%3, m%5 ; out15 - n
psubd m%3, m%5 ; out16 + n
REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%if %7 & 1
packssdw m%1, m%3 ; out0 + n, out16 + n
packssdw m%2, m%4 ; out15 - n, out31 - n
+%endif
%endmacro
cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
@@ -5574,14 +5758,15 @@ cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
call m(idct_8x8_internal_10bpc).write_8x4
RET
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 32
- add r6d, 10240
- sar r6d, 14
- jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
-.pass1_main:
+.pass1_main_part1:
mova m0, [cq+128*0]
mova m1, [cq+128*1]
mova m2, [cq+128*2]
@@ -5590,7 +5775,6 @@ ALIGN function_align
mova m5, [cq+128*5]
mova m6, [cq+128*6]
mova m7, [cq+128*7]
- add cq, 32
call m(idct_8x8_internal_10bpc).main
psrld m1, m11, 10 ; pd_2
REPX {paddd x, m1}, m0, m6, m5, m3
@@ -5603,6 +5787,11 @@ ALIGN function_align
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.pass1_main:
+ call .pass1_main_part1
+ add cq, 32
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
@@ -5665,7 +5854,7 @@ ALIGN function_align
vpbroadcastd m15, [pd_4017]
vpbroadcastd m10, [pd_799]
ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
- ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
psubd m3, m0, m6 ; t19a
paddd m0, m6 ; t16a
psubd m6, m7, m1 ; t28a
@@ -5734,7 +5923,7 @@ ALIGN function_align
vpbroadcastd m15, [pd_2276]
vpbroadcastd m10, [pd_3406]
ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
- ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
psubd m3, m0, m6 ; t27a
paddd m0, m6 ; t24a
psubd m6, m7, m1 ; t20a
@@ -5747,8 +5936,8 @@ ALIGN function_align
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
- ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
- ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
mova m9, [r6-32*4] ; t16a
mova m10, [r6-32*3] ; t17
psubd m2, m9, m7 ; t23
@@ -5881,8 +6070,9 @@ ALIGN function_align
ret
cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
- vpbroadcastd m5, [pw_5]
vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_5]
pxor m6, m6
mov r6d, eobd
add eobb, 21
@@ -5947,30 +6137,262 @@ ALIGN function_align
vextracti128 [dstq+r4 ], m3, 1
ret
+cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ mov r4, cq
+ lea r6, [rsp+32*4]
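+ ; .pass1_main is run 1-4 times (eob thresholds 43/107/171) depending on how
+ ; much of the coefficient buffer is populated; pass 2 then reloads cq from r4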
+ call .pass1_main
+ cmp eobd, 43
+ jge .eob43
+ jmp .pass2_fast
+.eob43:
+ call .pass1_main
+ cmp eobd, 107
+ jge .eob107
+.pass2_fast:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ jmp .pass2_end
+.eob107:
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ jmp .pass2
+.eob171:
+ call .pass1_main
+.pass2:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ pmaxsd m4, m12, [cq+128*1+64]
+ pmaxsd m5, m12, [cq+128*7+64]
+ pmaxsd m6, m12, [cq+128*1+96]
+ pmaxsd m7, m12, [cq+128*7+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ pmaxsd m4, m12, [cq+128*3+64]
+ pmaxsd m5, m12, [cq+128*5+64]
+ pmaxsd m6, m12, [cq+128*3+96]
+ pmaxsd m7, m12, [cq+128*5+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ pmaxsd m4, m12, [cq+128*2+64]
+ pmaxsd m5, m12, [cq+128*6+64]
+ pmaxsd m6, m12, [cq+128*2+96]
+ pmaxsd m7, m12, [cq+128*6+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ pmaxsd m4, m12, [cq+128*0+64]
+ pmaxsd m5, m12, [cq+128*4+64]
+ pmaxsd m6, m12, [cq+128*0+96]
+ pmaxsd m7, m12, [cq+128*4+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+.pass2_end:
+ psrld m11, 8 ; pd_8
+ IDCT32_END 0, 15, 8, 9, 10, 4
+ IDCT32_END 1, 14, 8, 9, 10, 4
+ punpckhqdq m8, m0, m1 ; 16 17 (interleaved)
+ punpcklqdq m0, m1 ; 0 1 (interleaved)
+ punpcklqdq m1, m14, m15 ; 14 15 (interleaved)
+ punpckhqdq m14, m15 ; 30 31 (interleaved)
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 4
+ IDCT32_END 3, 14, 8, 9, 10, 4
+ punpckhqdq m8, m2, m3 ; 18 19 (interleaved)
+ punpcklqdq m2, m3 ; 2 3 (interleaved)
+ punpcklqdq m3, m14, m15 ; 12 13 (interleaved)
+ punpckhqdq m14, m15 ; 28 29 (interleaved)
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 4
+ IDCT32_END 5, 14, 8, 9, 10, 4
+ punpckhqdq m8, m4, m5 ; 20 21 (interleaved)
+ punpcklqdq m4, m5 ; 4 5 (interleaved)
+ punpcklqdq m5, m14, m15 ; 10 11 (interleaved)
+ punpckhqdq m14, m15 ; 26 27 (interleaved)
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 4
+ IDCT32_END 7, 14, 8, 9, 10, 4
+ punpckhqdq m8, m6, m7 ; 22 23 (interleaved)
+ punpcklqdq m6, m7 ; 6 7 (interleaved)
+ punpcklqdq m7, m14, m15 ; 8 9 (interleaved)
+ punpckhqdq m14, m15 ; 24 25 (interleaved)
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ mova m15, m1
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m2, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m7, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m3, q3120
+ vpermq m1, m15, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*3], q3120
+ vpermq m1, [r5+32*1], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*1], q3120
+ vpermq m1, [r5-32*3], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*4], q3120
+ vpermq m1, [r5-32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*0], q3120
+ vpermq m1, [r5+32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
+.pass1_main:
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 32
+ ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2, 0
+ mova [cq+32*16], m8
+ mova [cq+32*31], m9
+ IDCT32_END 1, 14, 8, 9, 10, 2, 0
+ mova [cq+32*17], m8
+ mova [cq+32*30], m9
+ mova [cq+32*14], m14
+ IDCT32_END 2, 14, 8, 9, 10, 2, 0
+ mova [cq+32*18], m8
+ mova [cq+32*29], m9
+ mova [cq+32*13], m14
+ IDCT32_END 3, 14, 8, 9, 10, 2, 0
+ mova [cq+32*19], m8
+ mova [cq+32*28], m9
+ mova [cq+32*12], m14
+ IDCT32_END 4, 14, 8, 9, 10, 2, 0
+ mova [cq+32*20], m8
+ mova [cq+32*27], m9
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ IDCT32_END 5, 10, 0, 1, 2, 2, 0
+ mova [cq+32*21], m0
+ mova [cq+32*26], m1
+ IDCT32_END 6, 9, 0, 1, 2, 2, 0
+ mova [cq+32*22], m0
+ mova [cq+32*25], m1
+ IDCT32_END 7, 8, 0, 1, 2, 2, 0
+ mova [cq+32*23], m0
+ mova [cq+32*24], m1
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 1]
+ mova m2, [cq+32* 2]
+ mova m11, m14
+ mova m12, [cq+32*12]
+ mova m13, [cq+32*13]
+ mova m14, [cq+32*14]
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
+
cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .full
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 8
+ or r3d, 8
.dconly:
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
+ paddsw xm0, xm3
vpbroadcastw m0, xm0
- vpbroadcastd m4, [pixel_10bpc_max]
- pxor m3, m3
.dconly_loop:
- paddw m1, m0, [dstq+32*0]
- paddw m2, m0, [dstq+32*1]
- pmaxsw m1, m3
- pmaxsw m2, m3
- pminsw m1, m4
- pminsw m2, m4
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ psubusw m1, m3
+ psubusw m2, m3
mova [dstq+32*0], m1
mova [dstq+32*1], m2
add dstq, strideq
@@ -5979,6 +6401,39 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
RET
.full:
PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ call .pass1
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ lea r6, [deint_shuf+128]
+ vpbroadcastd m11, [pw_2048]
+ mov r4, dstq
+ call .pass2
+ mova m0, [r5+32*3] ; 16 17
+ mova m1, [r5+32*2] ; 30 31
+ mova m2, [r5+32*1] ; 18 19
+ mova m3, [r5+32*0] ; 28 29
+ mova m4, [r5-32*1] ; 20 21
+ mova m5, [r5-32*2] ; 26 27
+ mova m6, [r5-32*3] ; 22 23
+ mova m7, [r5-32*4] ; 24 25
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ lea dstq, [r4+32]
+ call .pass2
+ RET
+ALIGN function_align
+.pass2:
+ call m(idct_16x8_internal_8bpc).main
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.pass1:
mova m0, [cq+32* 1]
mova m1, [cq+32* 7]
mova m2, [cq+32* 9]
@@ -5988,10 +6443,7 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
mova m6, [cq+32*25]
mova m7, [cq+32*31]
vpbroadcastd m11, [pd_2048]
- vpbroadcastd m12, [clip_18b_min]
- vpbroadcastd m13, [clip_18b_max]
vpbroadcastd m14, [pd_2896]
- lea r6, [rsp+32*4]
call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
mova m0, [cq+32* 3]
mova m1, [cq+32* 5]
@@ -6021,37 +6473,12 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
mova m7, [cq+32*28]
call m(idct_8x8_internal_10bpc).main
call m(idct_8x16_internal_10bpc).main_evenhalf
- call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
- lea r6, [deint_shuf+128]
- vpbroadcastd m11, [pw_2048]
- mov r4, dstq
- call .pass2
- mova m0, [r5+32*3] ; 16 17
- mova m1, [r5+32*2] ; 30 31
- mova m2, [r5+32*1] ; 18 19
- mova m3, [r5+32*0] ; 28 29
- mova m4, [r5-32*1] ; 20 21
- mova m5, [r5-32*2] ; 26 27
- mova m6, [r5-32*3] ; 22 23
- mova m7, [r5-32*4] ; 24 25
- call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
- lea dstq, [r4+32]
- call .pass2
- RET
-ALIGN function_align
-.pass2:
- call m(idct_16x8_internal_8bpc).main
- REPX {pmulhrsw x, m11}, m0, m1, m2, m3
- call m(idct_16x8_internal_10bpc).write_16x4_start
- pmulhrsw m0, m11, m4
- pmulhrsw m1, m11, m5
- pmulhrsw m2, m11, m6
- pmulhrsw m3, m11, m7
- jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ ret
cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
- vpbroadcastd m5, [pw_4096]
vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_4096]
pxor m6, m6
mov r6d, eobd
add eobb, 21
@@ -6078,6 +6505,47 @@ cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
jge .loop
RET
+cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
+ call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
+ mov r4, dstq
+ call m(idct_16x8_internal_12bpc).pass2_main
+ mova m0, [cq+32* 0] ; 16
+ mova m1, [cq+32* 1] ; 17
+ mova m2, [cq+32* 2] ; 18
+ mova m3, [cq+32* 3] ; 19
+ mova m4, [cq+32* 4] ; 20
+ mova m5, [cq+32* 5] ; 21
+ mova m6, [cq+32* 6] ; 22
+ mova m7, [cq+32* 7] ; 23
+ mova m8, [cq+32* 8] ; 24
+ mova m9, [cq+32* 9] ; 25
+ mova m10, [cq+32*10] ; 26
+ mova m11, [cq+32*11] ; 27
+ mova m12, [cq+32*12] ; 28
+ mova m13, [cq+32*13] ; 29
+ mova m14, [cq+32*14] ; 30
+ mova m15, [cq+32*15] ; 31
+ lea dstq, [r4+32]
+ call m(idct_16x8_internal_12bpc).pass2_main
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
+
%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
mova m%4, [%2]
paddsw m%3, m%1, m%4
@@ -6121,13 +6589,14 @@ cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
jmp .fast
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 32
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
.eob44:
mova [r4+16*0], xm0
mova [r4+16*1], xm3
@@ -6472,14 +6941,15 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
jmp .end
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
.full:
add cq, 32
@@ -6742,9 +7212,10 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
.fast:
lea r4, [rsp+32*71]
@@ -7019,12 +7490,13 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 64
- add r6d, 10240
- sar r6d, 14
- jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
.fast:
lea r4, [rsp+32*38]
pxor m0, m0
@@ -7246,7 +7718,7 @@ ALIGN function_align
REPX {pmaxsd x, m12}, m8, m1, m6, m2
REPX {pminsd x, m13}, m8, m1, m6, m2
ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
- ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
REPX {pmaxsd x, m12}, m0, m3, m7, m4
REPX {pminsd x, m13}, m0, m3, m7, m4
vpbroadcastd m10, [r5+4*10]
@@ -7301,7 +7773,7 @@ ALIGN function_align
REPX {pmaxsd x, m12}, m8, m1, m3, m4
REPX {pminsd x, m13}, m8, m1, m3, m4
ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
- ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
REPX {pmaxsd x, m12}, m0, m2, m5, m7
REPX {pminsd x, m13}, m0, m5, m2, m7
psubd m6, m2, m7 ; t48a
@@ -7358,14 +7830,15 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 64
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ or r3d, 64
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
.fast:
lea r4, [rsp+32*70]
@@ -7540,30 +8013,26 @@ ALIGN function_align
cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .normal
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 16
+ or r3d, 16
.dconly:
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ vpbroadcastd m5, [dconly_10bpc]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
-%if WIN64
- movaps [rsp+8], xmm6
-%endif
+ paddsw xm0, xm5
vpbroadcastw m0, xm0
- vpbroadcastd m6, [pixel_10bpc_max]
- pxor m5, m5
.dconly_loop:
- paddw m1, m0, [dstq+32*0]
- paddw m2, m0, [dstq+32*1]
- paddw m3, m0, [dstq+32*2]
- paddw m4, m0, [dstq+32*3]
- REPX {pmaxsw x, m5}, m1, m2, m3, m4
- REPX {pminsw x, m6}, m1, m2, m3, m4
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ paddsw m3, m0, [dstq+32*2]
+ paddsw m4, m0, [dstq+32*3]
+ REPX {psubusw x, m5}, m1, m2, m3, m4
mova [dstq+32*0], m1
mova [dstq+32*1], m2
mova [dstq+32*2], m3
@@ -7571,9 +8040,6 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
add dstq, strideq
dec r3d
jg .dconly_loop
-%if WIN64
- movaps xmm6, [rsp+8]
-%endif
RET
.normal:
PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
@@ -7814,14 +8280,14 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 32
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
.fast:
pxor m0, m0
@@ -7963,9 +8429,9 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
.fast:
pxor m0, m0
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm
new file mode 100644
index 00000000000..b05fde54dc8
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm
@@ -0,0 +1,2599 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23
+ db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
+ db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
+ db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
+idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51
+ db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
+ db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17
+ db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25
+iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23
+ db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
+ db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19
+ db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
+permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13
+ db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
+ db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15
+ db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
+permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2
+ db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6
+ db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7
+ db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3
+permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6
+ db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14
+ db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7
+ db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15
+idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25
+ db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
+ db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29
+ db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
+idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30
+ db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
+ db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31
+ db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63
+
+pw_2048_m2048: times 16 dw 2048
+pw_m2048_2048: times 16 dw -2048
+pw_2048: times 16 dw 2048
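+; the +16 offsets used with vbroadcasti32x8 straddle two of the tables above,
+; yielding eight words of 2048 followed by eight of -2048 (pw_2048_m2048+16)
+; or the reverse (pw_m2048_2048+16)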
+
+; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-
+%macro COEF_PAIR 2-3 0 ; a, b, flags
+%if %3 == 1
+pd_%1_m%2: dd %1, %1, -%2, -%2
+%define pd_%1 (pd_%1_m%2 + 4*0)
+%define pd_m%2 (pd_%1_m%2 + 4*2)
+%elif %3 == 2
+pd_m%1_%2: dd -%1, -%1, %2, %2
+%define pd_m%1 (pd_m%1_%2 + 4*0)
+%define pd_%2 (pd_m%1_%2 + 4*2)
+%else
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3 == 3
+%define pd_%2_m%2 pd_%2
+dd -%2, -%2
+%endif
+%endif
+%endmacro
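+; e.g. "COEF_PAIR 799, 2276, 1" emits "pd_799_m2276: dd 799, 799, -2276, -2276"
+; and aliases pd_799/pd_m2276 to its two halves, so a single vbroadcasti32x4 can
+; load both rotation constants of a butterfly into every 128-bit lane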
+
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1189, 1
+COEF_PAIR 401, 1931
+COEF_PAIR 401, 3920
+COEF_PAIR 799, 2276, 1
+COEF_PAIR 799, 3406
+COEF_PAIR 799, 4017
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2598, 1931, 2
+COEF_PAIR 2598, 3612
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 3
+COEF_PAIR 2896, 3784, 3
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 1931
+COEF_PAIR 3166, 3612
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4017, 3406
+COEF_PAIR 4076, 1189
+COEF_PAIR 4076, 3612
+COEF_PAIR 4076, 3920
+COEF_PAIR 4091, 3973
+
+pw_5: times 2 dw 5
+pw_4096: times 2 dw 4096
+pw_1697x16: times 2 dw 1697*16
+pw_2896x8: times 2 dw 2896*8
+pixel_10bpc_max: times 2 dw 0x03ff
+dconly_10bpc: times 2 dw 0x7c00 ; 32767 - 1023
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+pd_1: dd 1
+pd_2: dd 2
+pd_1448: dd 1448
+pd_2048: dd 2048
+pd_3071: dd 3071 ; 1024 + 2048 - 1
+pd_3072: dd 3072 ; 1024 + 2048
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+
+cextern int8_permA
+cextern idct_8x8_internal_8bpc_avx512icl.main
+cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_8x16_internal_8bpc_avx512icl.main
+cextern idct_8x16_internal_8bpc_avx512icl.main2
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_8x16_internal_8bpc_avx512icl.main2
+cextern idct_16x8_internal_8bpc_avx512icl.main
+cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_16x16_internal_8bpc_avx512icl.main
+cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
+cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
+
+SECTION .text
+
+%define o_base (pw_2048+4*128)
+%define o_base_8bpc (int8_permA+64*18)
+%define o(x) (r5 - o_base + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
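+; constants are addressed relative to r5 (loaded with o_base on entry); before
+; calling into the 8bpc AVX-512 functions, r5 is repointed at o_base_8bpc, which
+; is assumed to match that file's own constant base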
+
+INIT_ZMM avx512icl
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = inv_dst1, 2 = inv_dst2
+; skip round/shift if rnd is not a number
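+; dst1/dst2 overwrite src1/src2 in place; e.g. "ITX_MULSUB_2D 5, 3, 8, 9, 10, 13, 3406, 2276"
+; leaves (m5*3406 - m3*2276 + 2048) >> 12 in m5 and (m5*2276 + m3*3406 + 2048) >> 12 in m3,
+; with m8-m10 as scratch. Coefficients written as "coef1_coef2" parse as one large number
+; (NASM allows underscores in numerals), so they take the vbroadcasti32x4 branch below and
+; load the corresponding pd_coef1_coef2 lane pair.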
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %8 < 4096
+ vpbroadcastd m%3, [o(pd_%8)]
+%else
+ vbroadcasti32x4 m%3, [o(pd_%8)]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %7 < 4096
+ vpbroadcastd m%5, [o(pd_%7)]
+%else
+ vbroadcasti32x4 m%5, [o(pd_%7)]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+%if %9 & 1
+ psubd m%1, m%3, m%1
+%else
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
+
+%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
+cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_10bpc)
+ lea r5, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd ym2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw ym1, r6d
+ paddsw ym1, ym2
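+ ; ym1 = dc + (32767 - 1023), so the paddsw/psubusw pair below clamps dst+dc
+ ; to [0, 1023] without separate pmaxsw/pminsw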
+.dconly_loop:
+ mova xm0, [dstq+strideq*0]
+ vinserti32x4 ym0, [dstq+strideq*1], 1
+ paddsw ym0, ym1
+ psubusw ym0, ym2
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call .load
+ vpermi2q m1, m0, m2 ; 1 5
+ vpermi2q m3, m6, m4 ; 7 3
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ call .main_end
+ mova m4, [o(idct8x8p)]
+ packssdw m0, m2 ; 0 1 4 5
+ packssdw m1, m3 ; 3 2 7 6
+ vpermb m0, m4, m0
+ vprolq m1, 32
+ vpermb m2, m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ call m(idct_8x8_internal_8bpc).main
+ mova m10, [permC]
+ vpbroadcastd m12, [pw_2048]
+.end:
+ vpermt2q m0, m10, m1
+ vpermt2q m2, m10, m3
+.end2:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ pmulhrsw m8, m12, m0
+ call .write_8x4_start
+ pmulhrsw m8, m12, m2
+.write_8x4:
+ lea dstq, [dstq+strideq*4]
+ add cq, 64*2
+.write_8x4_start:
+ mova xm9, [dstq+strideq*0]
+ vinserti32x4 ym9, [dstq+strideq*1], 1
+ vinserti32x4 m9, [dstq+strideq*2], 2
+ vinserti32x4 m9, [dstq+r6 ], 3
+ mova [cq+64*0], m10
+ mova [cq+64*1], m10
+ paddw m9, m8
+ pmaxsw m9, m10
+ pminsw m9, m11
+ mova [dstq+strideq*0], xm9
+ vextracti32x4 [dstq+strideq*1], ym9, 1
+ vextracti32x4 [dstq+strideq*2], m9, 2
+ vextracti32x4 [dstq+r6 ], m9, 3
+ ret
+ALIGN function_align
+.load:
+ mova m0, [cq+64*0] ; 0 1
+ mova m4, [cq+64*1] ; 2 3
+ mova m1, [o(permB)]
+ mova m2, [cq+64*2] ; 4 5
+ mova m6, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m5, m1, 32
+ vpbroadcastd m12, [o(pd_2896)]
+ mova m3, m1
+ vpbroadcastd m11, [o(pd_1)]
+ ret
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m8, [o(pd_799_4017)]
+ pmulld m8, m1 ; t4 t7
+ vpmulld m0, [o(pd_2896)] {1to16} ; dct4 out0 out1
+ REPX {paddd x, m13}, m8, m0
+ REPX {psrad x, 12 }, m8, m0
+ pmulld m3, m8, m12
+ mova m2, m0 ; dct4 out3 out2
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m3, [o(pd_4017_3406)]
+ vbroadcasti32x4 m8, [o(pd_799_m2276)]
+ vbroadcasti32x4 m2, [o(pd_2896_3784)]
+ vbroadcasti32x4 m9, [o(pd_2896_1567)]
+ pmulld m3, m1 ; t4a t5a
+ pmulld m1, m8 ; t7a t6a
+ pmulld m2, m0 ; t0 t3
+ pmulld m0, m9 ; t1 t2
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276
+ ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
+.main2:
+ REPX {paddd x, m13}, m1, m3, m0, m2
+ REPX {psrad x, 12 }, m1, m3, m0, m2
+ punpcklqdq m8, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m8, m1 ; t5a t6a
+ paddd m8, m1 ; t4 t7
+ pmaxsd m3, m14
+ punpckhqdq m1, m2, m0 ; t3 t2
+ pminsd m3, m15
+ punpcklqdq m2, m0 ; t0 t1
+ pmulld m3, m12
+ paddd m0, m2, m1 ; dct4 out0 out1
+ psubd m2, m1 ; dct4 out3 out2
+ REPX {pmaxsd x, m14}, m8, m0, m2
+ REPX {pminsd x, m15}, m8, m0, m2
+.main3:
+ pshufd m1, m3, q1032
+ paddd m3, m13
+ psubd m9, m3, m1
+ paddd m3, m1
+ psrad m9, 12
+ psrad m3, 12
+ punpckhqdq m1, m8, m3 ; t7 t6
+ shufpd m8, m9, 0xaa ; t4 t5
+ ret
+.main_end:
+ paddd m0, m11
+ paddd m2, m11
+ psubd m3, m0, m1 ; out7 out6
+ paddd m0, m1 ; out0 out1
+ paddd m1, m2, m8 ; out3 out2
+ psubd m2, m8 ; out4 out5
+ REPX {vpsravd x, m11}, m0, m2, m3, m1
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+INV_TXFM_8X8_FN adst, adst
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ punpckldq m1, m2, m4 ; out4 out6
+ punpckhdq m2, m0 ; -out5 -out7
+ punpckldq m0, m3 ; out0 out2
+ punpckhdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.pass1_end:
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m0, m1 ; 0 2 4 6
+ packssdw m4, m3 ; 1 3 5 7
+ psrlq m1, [o(permB)], 8
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ psrlq m2, m1, 32
+ vpermi2q m1, m0, m3
+ vpermt2q m0, m2, m3
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ movu m10, [permC+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ jmp m(idct_8x8_internal_10bpc).end
+.main_pass2:
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ lea r5, [o_base_8bpc]
+ pshufd ym4, ym0, q1032
+ pshufd ym5, ym1, q1032
+ jmp m(iadst_8x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m14}, m4, m2, m0, m1
+ REPX {pminsd x, m15}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ shufpd m4, m2, 0xaa ; t4 t7
+ shufpd m2, m5, 0xaa ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m14}, m1, m2
+ REPX {pminsd x, m15}, m1, m2
+ shufpd m3, m1, m2, 0xaa
+ shufpd m1, m2, 0x55
+ pmulld m3, m12
+ pmulld m1, m12
+ paddd m3, m13
+ psubd m2, m3, m1
+ paddd m3, m1
+ psrad m2, 12 ; out4 -out5
+ pshufd m3, m3, q1032
+ psrad m3, 12 ; out2 -out3
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, identity
+INV_TXFM_8X8_FN flipadst, flipadst
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call m(iadst_8x8_internal_10bpc).main
+ punpckhdq m1, m3, m4 ; -out3 -out1
+ punpckldq m3, m0 ; out2 out0
+ punpckhdq m0, m2 ; -out7 -out5
+ punpckldq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_8x8_internal_10bpc).main_pass2
+ movu m10, [permC+1]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ lea r6, [strideq*3]
+ vpermt2q m0, m10, m1 ; 7 6 5 4
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m2, m10, m3 ; 3 2 1 0
+ pxor m10, m10
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m0
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ packssdw m1, [cq+64*2] ; 0 4 1 5
+ mova m2, [cq+64*1] ; 2 6 3 7
+ packssdw m2, [cq+64*3]
+ mova m0, [o(idtx8x8p)]
+ vpermb m1, m0, m1
+ vpermb m2, m0, m2
+ punpckldq m0, m1, m2 ; 0 1 4 5
+ punpckhdq m1, m2 ; 2 3 6 7
+ jmp tx2q
+.pass2:
+ movu m3, [o(permC+2)]
+ vpbroadcastd m12, [o(pw_4096)]
+ psrlq m2, m3, 32
+ vpermi2q m2, m0, m1
+ vpermt2q m0, m3, m1
+ jmp m(idct_8x8_internal_10bpc).end2
+
+%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
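+ ; rectangular (2:1) transforms carry an extra 1/sqrt(2) scale: 181/256 ~= 0.7071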
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, adst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call .load
+ call .main
+ call .main_end
+.pass1_end:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ jmp tx2q
+.pass2:
+ mova m8, [o(idct8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpcklqdq m8, m0, m2 ; 15 1
+ punpckhqdq m0, m2 ; 7 9
+ punpckhqdq m1, m5, m4 ; 3 13
+ punpcklqdq m5, m4 ; 11 5
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym7, m8, 1 ; 14 2
+ vextracti32x8 ym3, m0, 1 ; 6 10
+ vextracti32x8 ym6, m1, 1 ; 12 4
+ vextracti32x8 ym9, m5, 1 ; 8 0
+ call m(idct_8x16_internal_8bpc).main2
+ mova m8, [permC]
+ vpbroadcastd m12, [pw_2048]
+ vpermt2q m0, m8, m1
+ lea r6, [strideq*3]
+ vpermt2q m2, m8, m3
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m4, m8, m5
+ pxor m10, m10
+ vpermt2q m6, m8, m7
+ pmulhrsw m8, m12, m0
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*1]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*7]
+ mova ym7, [cq+64*3]
+ call .round_input_fast
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ movu m6, [o(permC+3)]
+ packssdw m3, m1, m3
+ packssdw m1, m0, m2
+ vprolq m3, 32
+ vpermd m1, m6, m1
+ vpermd m3, m6, m3
+ mova ym0, ym1 ; 0 4
+ vextracti32x8 ym1, m1, 1 ; 1 5
+ mova ym2, ym3 ; 2 6
+ vextracti32x8 ym3, m3, 1 ; 3 7
+ jmp tx2q
+ALIGN function_align
+.round_input_fast:
+ movshdup m8, [o(permB)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpermt2q m0, m8, m4
+ vpermt2q m1, m8, m5
+ vpermt2q m2, m8, m6
+ vpermt2q m3, m8, m7
+ vpbroadcastd m13, [o(pd_2048)]
+ REPX {pmulld x, m12}, m0, m1, m2, m3
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m11, [o(pd_1)]
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+ALIGN function_align
+.load:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+.load2:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m0, m12, [cq+64*0]
+ pmulld m1, m12, [cq+64*1]
+ pmulld m2, m12, [cq+64*2]
+ pmulld m3, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pmulld m4, m12, [cq+64*4]
+ pmulld m5, m12, [cq+64*5]
+ pmulld m6, m12, [cq+64*6]
+ pmulld m7, m12, [cq+64*7]
+ REPX {paddd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, 13, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, 13, 799, 4017 ; t4a t7a
+ pmulld m0, m12
+ pmulld m4, m12
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ psubd m5, m7, m3 ; t6a
+ paddd m7, m3 ; t7
+ pmaxsd m5, m14
+ pmaxsd m1, m14
+ pminsd m5, m15
+ pminsd m1, m15
+ pmulld m5, m12
+ pmulld m1, m12
+ ITX_MULSUB_2D 2, 6, 3, 9, 10, 13, 1567, 3784 ; t2 t3
+ pmaxsd m8, m14
+ pmaxsd m7, m14
+ paddd m0, m13
+ pminsd m8, m15
+ psubd m3, m0, m4
+ paddd m5, m13
+ paddd m0, m4
+ psubd m4, m5, m1
+ paddd m5, m1
+ REPX {psrad x, 12 }, m3, m5, m0, m4
+ paddd m1, m3, m2 ; dct4 out1
+ psubd m2, m3, m2 ; dct4 out2
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ pminsd m6, m15, m7
+ REPX {pmaxsd x, m14}, m0, m1, m2, m3
+ REPX {pminsd x, m15}, m0, m1, m2, m3
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_1)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ psubd m6, m1, m5 ; out6
+ paddd m1, m5 ; out1
+ psubd m5, m2, m4 ; out5
+ paddd m2, m4 ; out2
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, identity, 35
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, adst
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call .main
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m10, 1
+ psrad m7, m11, 1
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call .fast_main
+ punpcklqdq m1, m2, m4 ; out4 out6
+ punpckhqdq m2, m0 ; -out5 -out7
+ punpcklqdq m0, m3 ; out0 out2
+ punpckhqdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.fast_end:
+ movu m5, [o(permC+3)]
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m2, m0, m1 ; 0 2 4 6
+ packssdw m3, m4, m3 ; 1 3 5 7
+ vpermd m2, m5, m2
+ vpermd m3, m5, m3
+ mova ym0, ym2
+ vextracti32x8 ym2, m2, 1
+ mova ym1, ym3
+ vextracti32x8 ym3, m3, 1
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ movu m4, [permB+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ psrlq m7, m4, 8
+ vpermi2q m4, m0, m3 ; 0 1 2 3
+ psrlq m5, m7, 24
+ vpermi2q m7, m0, m3 ; 12 13 14 15
+ psrlq m6, m5, 8
+ vpermq m5, m5, m1 ; 4 5 6 7
+ vpermq m6, m6, m2 ; 8 9 10 11
+.pass2_end:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ pxor m10, m10
+ lea r6, [strideq*3]
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m5
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m7
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
+ vpbroadcastd m10, [o(pd_1567)]
+ vpbroadcastd m11, [o(pd_3784)]
+ ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
+ vpbroadcastd m12, [o(pd_1448)]
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m14}, m5, m3, m2, m9
+ REPX {pminsd x, m15}, m5, m3, m2, m9
+ REPX {pmulld x, m12}, m5, m3, m2, m9
+ vpbroadcastd m4, [o(pd_1)]
+ psubd m8, m5, m3 ; (t2 - t3) * 1448
+ paddd m3, m5 ; (t2 + t3) * 1448
+ psubd m5, m2, m9 ; (t6 - t7) * 1448
+ paddd m2, m9 ; (t6 + t7) * 1448
+ vpbroadcastd m9, [o(pd_3072)]
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m10, m6, m4
+ psubd m11, m4, m7
+ paddd m2, m9
+ paddd m8, m9
+ vpbroadcastd m9, [o(pd_3071)]
+ psubd m3, m9, m3
+ psubd m9, m5
+ ret
+ALIGN function_align
+.fast_main:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*7]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*3]
+ mova ym7, [cq+64*1]
+ call m(idct_8x16_internal_10bpc).round_input_fast
+ jmp m(iadst_8x8_internal_10bpc).main
+ALIGN function_align
+.pass2_main:
+ mova m8, [o(iadst8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ vpbroadcastd m10, [o(pw_2896x8)]
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m2, m3
+ punpckldq m2, m3
+ lea r5, [o_base_8bpc]
+ punpckhqdq m4, m0, m2 ; 12 3 14 1
+ punpcklqdq m0, m2 ; 0 15 2 13
+ punpckhqdq m6, m5, m1 ; 8 7 10 5
+ punpcklqdq m5, m1 ; 4 11 6 9
+ call m(iadst_8x16_internal_8bpc).main2
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m10 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m10 ; out8 -out11 -out9 out10
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, identity, 35
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call m(iadst_8x16_internal_10bpc).main
+ psrad m7, m0, 1
+ psrad m0, m11, 1
+ psrad m6, m1, 1
+ psrad m1, m10, 1
+ psrad m5, m2, 12
+ psrad m2, m9, 12
+ psrad m4, m3, 12
+ psrad m3, m8, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_8x16_internal_10bpc).fast_main
+ punpckhqdq m1, m3, m4 ; -out3 -out1
+ punpcklqdq m3, m0 ; out2 out0
+ punpckhqdq m0, m2 ; -out7 -out5
+ punpcklqdq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x16_internal_10bpc).fast_end
+.pass2:
+ call m(iadst_8x16_internal_10bpc).pass2_main
+ movu m7, [permB+2]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ psrlq m4, m7, 8
+ vpermi2q m7, m3, m0 ; 3 2 1 0
+ psrlq m5, m4, 24
+ vpermi2q m4, m3, m0 ; 15 14 13 12
+ psrlq m6, m5, 8
+ vpermq m5, m5, m2 ; 11 10 9 8
+ vpermq m6, m6, m1 ; 7 6 5 4
+ jmp m(iadst_8x16_internal_10bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.pass2:
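+ ; identity16 scales by 2*sqrt(2): pmulhrsw with 1697*16 gives x*1697/2048,
+ ; i.e. ~x*(2*sqrt(2) - 2), which is then added to 2*x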
+ vpbroadcastd m8, [o(pw_1697x16)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m7, [o(pw_2048)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ vpbroadcastd m6, [o(pixel_10bpc_max)]
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ pxor m5, m5
+ punpckhqdq m1, m0, m2 ; 1 5 9 13
+ punpcklqdq m0, m2 ; 0 4 8 12
+ punpcklqdq m2, m3, m4 ; 2 6 10 14
+ punpckhqdq m3, m4 ; 3 7 11 15
+ lea r6, [strideq*3]
+ pmulhrsw m0, m7
+ call .write_8x4_start
+ pmulhrsw m0, m7, m1
+ call .write_8x4
+ pmulhrsw m0, m7, m2
+ call .write_8x4
+ pmulhrsw m0, m7, m3
+.write_8x4:
+ add dstq, strideq
+ add cq, 64*2
+.write_8x4_start:
+ mova xm4, [dstq+strideq*0]
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ vinserti32x4 m4, [dstq+strideq*8], 2
+ vinserti32x4 m4, [dstq+r6*4 ], 3
+ mova [cq+64*0], m5
+ mova [cq+64*1], m5
+ paddw m4, m0
+ pmaxsw m4, m5
+ pminsw m4, m6
+ mova [dstq+strideq*0], xm4
+ vextracti32x4 [dstq+strideq*4], ym4, 1
+ vextracti32x4 [dstq+strideq*8], m4, 2
+ vextracti32x4 [dstq+r6*4 ], m4, 3
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly:
+ vpbroadcastd m2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m1, r6d
+ paddsw m1, m2
+.dconly_loop:
+ mova ym0, [dstq+strideq*0]
+ vinserti32x8 m0, [dstq+strideq*1], 1
+ paddsw m0, m1
+ psubusw m0, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, -21
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, adst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m4, m12, [cq+64*0] ; 0 1
+ pmulld m9, m12, [cq+64*1] ; 2 3
+ pmulld m8, m12, [cq+64*2] ; 4 5
+ pmulld m7, m12, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m2, m2
+ mova m15, [o(permB)]
+ REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
+ psrlq m0, m15, 32
+ REPX {paddd x, m13}, m4, m9, m8, m7
+ vpbroadcastd m14, [o(clip_18b_min)]
+ REPX {psrad x, 12 }, m4, m8, m9, m7
+ mova m1, m0
+ vpermi2q m0, m4, m8 ; 0 4
+ cmp eobd, 43
+ jl .fast
+ pmulld m5, m12, [cq+64*4] ; 8 9
+ pmulld m10, m12, [cq+64*5] ; 10 11
+ pmulld m11, m12, [cq+64*6] ; 12 13
+ pmulld m6, m12, [cq+64*7] ; 14 15
+ REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m5, m10, m11, m6
+ REPX {psrad x, 12 }, m10, m5, m11, m6
+ mova m2, m1
+ vpermi2q m1, m9, m10 ; 2 10
+ mova m3, m2
+ vpermi2q m2, m5, m11 ; 8 12
+ vpermi2q m3, m6, m7 ; 14 6
+ vpermt2q m4, m15, m11 ; 1 13
+ vpermt2q m6, m15, m9 ; 15 3
+ vpermt2q m5, m15, m8 ; 9 5
+ vpermt2q m7, m15, m10 ; 7 11
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main
+ call .main
+ jmp .pass1_end
+.fast:
+ vpermi2q m1, m9, m7 ; 2 6
+ vpermt2q m4, m15, m9 ; 1 3
+ vpermt2q m7, m15, m8 ; 7 5
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main_fast
+ call .main_fast
+.pass1_end:
+ call m(idct_8x16_internal_10bpc).main_end
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+.pass1_end2:
+ mova m10, m9
+ mova m11, m8
+ call .transpose_16x8
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x8_internal_8bpc).main
+ movshdup m4, [permC]
+ vpbroadcastd m13, [pw_2048]
+ psrlq m5, m4, 8
+ vpermq m0, m4, m0
+ vpermq m1, m5, m1
+ vpermq m2, m4, m2
+ vpermq m3, m5, m3
+.end:
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ pmulhrsw m8, m13, m0
+ pmulhrsw m9, m13, m1
+ lea r6, [strideq*3]
+ call .write_16x4
+ pmulhrsw m8, m13, m2
+ pmulhrsw m9, m13, m3
+.write_16x4:
+ mova ym10, [dstq+strideq*0]
+ vinserti32x8 m10, [dstq+strideq*1], 1
+ paddw m8, m10
+ mova ym10, [dstq+strideq*2]
+ vinserti32x8 m10, [dstq+r6 ], 1
+ paddw m9, m10
+ pmaxsw m8, m14
+ pmaxsw m9, m14
+ pminsw m8, m15
+ pminsw m9, m15
+ mova [dstq+strideq*0], ym8
+ vextracti32x8 [dstq+strideq*1], m8, 1
+ mova [dstq+strideq*2], ym9
+ vextracti32x8 [dstq+r6 ], m9, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ pmulld m6, m4 ; t15 t12
+ pmulld m4, m3 ; t9 t10
+ REPX {paddd x, m13}, m6, m4
+ REPX {psrad x, 12 }, m6, m4
+ mova m5, m6 ; t14 t13
+ mova m9, m4 ; t8 t11
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ vbroadcasti32x4 m5, [o(pd_m2598_1931)]
+ vbroadcasti32x4 m9, [o(pd_3166_3612)]
+ pmulld m6, m4 ; t15a t12a
+ pmulld m4, m3 ; t8a t11a
+ pmulld m5, m7 ; t9a t10a
+ pmulld m7, m9 ; t14a t13a
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189
+ ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
+.main2:
+ REPX {paddd x, m13}, m4, m6, m5, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ paddd m9, m4, m5 ; t8 t11
+ psubd m4, m5 ; t9 t10
+ psubd m5, m6, m7 ; t14 t13
+ paddd m6, m7 ; t15 t12
+ REPX {pmaxsd x, m14}, m5, m4, m9, m6
+ REPX {pminsd x, m15}, m5, m4, m9, m6
+.main3:
+ psubd m3, m0, m1 ; dct8 out7 out6
+ paddd m0, m1 ; dct8 out0 out1
+ vbroadcasti32x4 m7, [o(pd_3784_m3784)]
+ pmulld m7, m5
+ vpmulld m5, [o(pd_1567)] {1to16}
+ paddd m1, m2, m8 ; dct8 out3 out2
+ psubd m2, m8 ; dct8 out4 out5
+ vbroadcasti32x4 m8, [o(pd_1567_m1567)]
+ pmulld m8, m4
+ vpmulld m4, [o(pd_3784)] {1to16}
+ REPX {pmaxsd x, m14}, m0, m1
+ REPX {pminsd x, m15}, m0, m1
+ paddd m7, m13
+ paddd m5, m13
+ paddd m7, m8
+ psubd m5, m4
+ psrad m7, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ punpckhqdq m4, m9, m7
+ punpcklqdq m8, m9, m5
+ punpckhqdq m5, m6, m5
+ punpcklqdq m6, m7
+ psubd m7, m8, m4 ; t11a t10
+ paddd m8, m4 ; t8a t9
+ psubd m4, m6, m5 ; t12a t13
+ paddd m6, m5 ; t15a t14
+ REPX {pmaxsd x, m14}, m4, m7
+ REPX {pminsd x, m15}, m4, m7
+ pmulld m4, m12
+ pmulld m7, m12
+ REPX {pmaxsd x, m14}, m2, m3, m6, m8
+ REPX {pminsd x, m15}, m2, m3, m6, m8
+ paddd m4, m13
+ paddd m5, m4, m7
+ psubd m4, m7
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ ret
+ALIGN function_align
+.transpose_16x8:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpermi2d m8, m0, m2
+ vpermt2d m0, m9, m2
+ vpermi2d m10, m1, m3
+ vpermi2d m11, m1, m3
+ punpckhwd m3, m8, m0
+ punpcklwd m1, m8, m0
+ punpckhwd m4, m10, m11
+ punpcklwd m2, m10, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, identity, -21
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, adst
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ call .main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp m(idct_16x8_internal_10bpc).pass1_end2
+.pass2:
+ call .main_pass2
+ vpermq m8, m13, m0
+ vpermq m9, m13, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m13, m2
+ vpermq m9, m13, m3
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m2, m12, [cq+64*0]
+ pmulld m7, m12, [cq+64*1]
+ pmulld m1, m12, [cq+64*2]
+ pmulld m5, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m4, m4
+ mova m10, [o(permB)]
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ REPX {paddd x, m13}, m2, m7, m1, m5
+ psrlq m6, m10, 32
+ REPX {psrad x, 12 }, m2, m7, m1, m5
+ mova m0, m6
+ vpermi2q m0, m2, m7 ; 0 2
+ vpermt2q m7, m10, m2 ; 3 1
+ mova m2, m6
+ vpermi2q m2, m1, m5 ; 4 6
+ vpermt2q m5, m10, m1 ; 7 5
+ cmp eobd, 43
+ jl .main_fast
+ pmulld m8, m12, [cq+64*4]
+ pmulld m3, m12, [cq+64*5]
+ pmulld m9, m12, [cq+64*6]
+ pmulld m1, m12, [cq+64*7]
+ REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m8, m3, m9, m1
+ REPX {psrad x, 12 }, m8, m3, m9, m1
+ mova m4, m6
+ vpermi2q m4, m8, m3 ; 8 10
+ vpermt2q m3, m10, m8 ; 11 9
+ vpermi2q m6, m9, m1 ; 12 14
+ vpermt2q m1, m10, m9 ; 15 13
+.main:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601
+ jmp .main2
+.main_fast:
+ vbroadcasti32x4 m1, [o(pd_4091_3973)]
+ vbroadcasti32x4 m8, [o(pd_201_995)]
+ vbroadcasti32x4 m3, [o(pd_3703_3290)]
+ vbroadcasti32x4 m9, [o(pd_1751_2440)]
+ vbroadcasti32x4 m4, [o(pd_2751_2106)]
+ vbroadcasti32x4 m10, [o(pd_3035_3513)]
+ vbroadcasti32x4 m6, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m1, m0
+ pmulld m0, m8
+ pmulld m3, m2
+ pmulld m2, m9
+ pmulld m4, m5
+ pmulld m5, m10
+ pmulld m6, m7
+ pmulld m7, m11
+.main2:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {psubd x, m13, x}, m1, m3
+ REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m14}, m8, m4, m5, m6
+ REPX {pminsd x, m15}, m8, m4, m5, m6
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m2, m1, m3
+ REPX {pminsd x, m15}, m0, m2, m1, m3
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m14}, m7, m3, m2, m6
+ REPX {pminsd x, m15}, m7, m3, m2, m6
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ vpbroadcastd m11, [o(pd_1567)]
+ vpbroadcastd m10, [o(pd_3784)]
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m4, m1, m8
+ REPX {pminsd x, m15}, m0, m4, m1, m8
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m14}, m6, m5, m3, m4
+ mov r6d, 0x3333
+ REPX {pminsd x, m15}, m6, m5, m3, m4
+ kmovw k1, r6d
+ REPX {pmulld x, m12}, m6, m5, m3, m4
+ pxor m9, m9
+ REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
+ paddd m6, m13
+ paddd m4, m13
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
+ ret
+ALIGN function_align
+.main_pass2:
+ lea r5, [o_base_8bpc]
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m13, [permC]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ lea r6, [strideq*3]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, identity, -21
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_10bpc).main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_16x8_internal_10bpc).main_pass2
+ psrlq m13, 8
+ vpermq m8, m13, m3
+ vpermq m9, m13, m2
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m13, m1
+ vpermq m9, m13, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ vpbroadcastd m8, [o(pd_5793)]
+ vpbroadcastd m9, [o(pd_3072)]
+ pxor m10, m10
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m10}, 0, 1, 2, 3
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m10}, 4, 5, 6, 7
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ psrlq m8, [o(permA)], 16
+ psrlq m9, m8, 8
+ mova m10, m8
+ mova m11, m9
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ jmp tx2q
+.pass2:
+ movshdup m4, [o(permC)]
+ vpbroadcastd m13, [o(pw_4096)]
+ REPX {vpermq x, m4, x}, m0, m1, m2, m3
+ jmp m(idct_16x8_internal_10bpc).end
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, adst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+64* 1]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m22, [cq+64*13]
+ mova m23, [cq+64*15]
+ call .main
+ call .main_end
+.pass1_end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+.pass1_end2:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ punpckhdq m7, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m3, m6
+ punpckldq m3, m6
+ vshufi32x4 m6, m0, m4, q3232
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m4, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m7, ym1, 1
+ vshufi32x4 m7, m1, q3232
+ vshufi32x4 m1, m2, m5, q3232
+ vinserti32x8 m2, ym5, 1
+ vshufi32x4 m5, m7, m1, q2020 ; 10 11
+ vshufi32x4 m7, m1, q3131 ; 14 15
+ vshufi32x4 m1, m3, m2, q2020 ; 2 3
+ vshufi32x4 m3, m2, q3131 ; 6 7
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+.pass1_end3:
+ mov r6d, 64*12
+ pxor m8, m8
+.zero_loop:
+ mova [cq+r6+64*3], m8
+ mova [cq+r6+64*2], m8
+ mova [cq+r6+64*1], m8
+ mova [cq+r6+64*0], m8
+ sub r6d, 64*4
+ jge .zero_loop
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x16_internal_8bpc).main
+ movshdup m10, [permC]
+ vpbroadcastd m13, [pw_2048]
+ psrlq m11, m10, 8
+ vpermq m8, m10, m0
+ vpermq m0, m11, m7
+ vpermq m7, m11, m1
+ vpermq m1, m10, m6
+ vpermq m6, m10, m2
+ vpermq m2, m11, m5
+ vpermq m5, m11, m3
+ vpermq m3, m10, m4
+.pass2_end:
+ lea r6, [strideq*3]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ pmulhrsw m8, m13, m8
+ pmulhrsw m9, m13, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m8, m13, m6
+ pmulhrsw m9, m13, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m8, m13, m3
+ pmulhrsw m9, m13, m2
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m8, m13, m1
+ pmulhrsw m9, m13, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym2, [cq+64*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+64*2]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*1]
+ mova ym5, [cq+64*3]
+ mova ym6, [cq+64*5]
+ mova ym7, [cq+64*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, 13, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, 13, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, 13, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, 13, 1931, 3612 ; t10a, t13a
+ paddd m9, m20, m16 ; t8
+ psubd m20, m16, m20 ; t9
+ psubd m16, m22, m18 ; t10
+ paddd m18, m22 ; t11
+ paddd m22, m23, m19 ; t15
+ psubd m23, m19 ; t14
+ psubd m19, m17, m21 ; t13
+ paddd m17, m21 ; t12
+ vpbroadcastd m11, [o(pd_3784)]
+ REPX {pmaxsd x, m14}, m20, m23, m16, m19
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m20, m23, m16, m19
+ ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
+ ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m9, m18, m22, m17
+ REPX {pminsd x, m15}, m9, m18, m22, m17
+ paddd m21, m20, m19 ; t14
+ psubd m20, m19 ; t13
+ psubd m19, m9, m18 ; t11a
+ paddd m9, m18 ; t8a
+ psubd m18, m23, m16 ; t10
+ paddd m16, m23 ; t9
+ psubd m23, m22, m17 ; t12a
+ paddd m22, m17 ; t15a
+ REPX {pmaxsd x, m14}, m20, m23, m18, m19
+ REPX {pminsd x, m15}, m20, m23, m18, m19
+ REPX {pmulld x, m12}, m20, m23, m18, m19
+ psubd m7, m0, m6 ; dct8 out7
+ paddd m0, m6 ; dct8 out0
+ psubd m6, m1, m5 ; dct8 out6
+ paddd m1, m5 ; dct8 out1
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1
+ psubd m5, m2, m4 ; dct8 out5
+ paddd m2, m4 ; dct8 out2
+ REPX {pminsd x, m15}, m7, m0, m6, m1
+ psubd m4, m3, m8 ; dct8 out4
+ paddd m3, m8 ; dct8 out3
+ REPX {pmaxsd x, m14}, m5, m2, m4, m3
+ paddd m20, m13
+ paddd m23, m13
+ REPX {pminsd x, m15}, m5, m2, m4, m3
+ psubd m17, m20, m18 ; t10a
+ paddd m20, m18 ; t13a
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ psubd m18, m23, m19 ; t11
+ paddd m19, m23 ; t12
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_2)]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m23, m0, m22 ; out15
+ paddd m0, m22 ; out0
+ psubd m22, m1, m21 ; out14
+ paddd m1, m21 ; out1
+ psubd m21, m2, m20 ; out13
+ paddd m2, m20 ; out2
+ psubd m20, m3, m19 ; out12
+ paddd m3, m19 ; out3
+ psubd m19, m4, m18 ; out11
+ paddd m4, m18 ; out4
+ psubd m18, m5, m17 ; out10
+ paddd m5, m17 ; out5
+ psubd m17, m6, m16 ; out9
+ paddd m6, m16 ; out6
+ psubd m16, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
+ m4, m20, m5, m21, m6, m22, m7, m23
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m20
+ packssdw m5, m21
+ packssdw m6, m22
+ packssdw m7, m23
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, flipadst
+INV_TXFM_16X16_FN adst, adst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call .main_pass1
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m5, m20
+ packssdw m5, m6, m21
+ packssdw m6, m7, m22
+ packssdw m7, m8, m23
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call .main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_fast_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+.pass1_fast_end2:
+ mova m10, m9
+ mova m11, m8
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m10, [permC]
+ mova m13, [pw_2048_m2048]
+ psrlq m11, m10, 8
+ vpermq m8, m11, m0
+ vpermq m0, m10, m7
+ vpermq m7, m11, m1
+ vpermq m1, m10, m6
+ vpermq m6, m11, m2
+ vpermq m2, m10, m5
+ vpermq m5, m11, m3
+ vpermq m3, m10, m4
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+ALIGN function_align
+.main_pass1:
+ mova m0, [cq+64* 0]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m23, [cq+64*15]
+ vpbroadcastd m13, [o(pd_2048)]
+ ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0
+ mova m7, [cq+64* 7]
+ mova m16, [cq+64* 8]
+ ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8
+ mova m2, [cq+64* 2]
+ mova m21, [cq+64*13]
+ ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2
+ mova m5, [cq+64* 5]
+ mova m18, [cq+64*10]
+ ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
+ mova m4, [cq+64* 4]
+ mova m19, [cq+64*11]
+ ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4
+ mova m3, [cq+64* 3]
+ mova m20, [cq+64*12]
+ ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
+ mova m6, [cq+64* 6]
+ mova m17, [cq+64* 9]
+ ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6
+ mova m1, [cq+64* 1]
+ mova m22, [cq+64*14]
+ ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psubd m9, m23, m7 ; t9a
+ paddd m23, m7 ; t1a
+ psubd m7, m2, m18 ; t10a
+ paddd m18, m2 ; t2a
+ REPX {pmaxsd x, m14}, m9, m23, m7, m18
+ psubd m2, m17, m1 ; t15a
+ paddd m17, m1 ; t7a
+ REPX {pminsd x, m15}, m9, m23, m7, m18
+ psubd m1, m21, m5 ; t11a
+ paddd m21, m5 ; t3a
+ REPX {pmaxsd x, m14}, m2, m17, m1, m21
+ psubd m5, m4, m20 ; t12a
+ paddd m4, m20 ; t4a
+ REPX {pminsd x, m15}, m2, m17, m1, m21
+ psubd m20, m19, m3 ; t13a
+ paddd m19, m3 ; t5a
+ REPX {pmaxsd x, m14}, m5, m4, m20, m19
+ psubd m8, m6, m22 ; t14a
+ paddd m6, m22 ; t6a
+ REPX {pminsd x, m15}, m5, m4, m20, m19
+ psubd m22, m0, m16 ; t8a
+ paddd m16, m0 ; t0a
+ REPX {pmaxsd x, m14}, m8, m6, m22, m16
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ REPX {pminsd x, m15}, m8, m6, m22, m16
+ ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8
+ ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10
+ ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15
+ paddd m0, m16, m4 ; t0
+ psubd m16, m4 ; t4
+ psubd m3, m23, m19 ; t5
+ paddd m23, m19 ; t1
+ REPX {pmaxsd x, m14}, m0, m16, m3, m23
+ psubd m19, m18, m6 ; t6
+ paddd m18, m6 ; t2
+ REPX {pminsd x, m15}, m0, m16, m3, m23
+ psubd m6, m21, m17 ; t7
+ paddd m21, m17 ; t3
+ REPX {pmaxsd x, m14}, m19, m18, m6, m21
+ paddd m17, m9, m20 ; t8a
+ psubd m9, m20 ; t12a
+ REPX {pminsd x, m15}, m19, m18, m6, m21
+ psubd m20, m22, m5 ; t13a
+ paddd m22, m5 ; t9a
+ REPX {pmaxsd x, m14}, m17, m9, m20, m22
+ psubd m5, m1, m2 ; t14a
+ paddd m1, m2 ; t10a
+ REPX {pminsd x, m15}, m17, m9, m20, m22
+ psubd m2, m7, m8 ; t15a
+ paddd m7, m8 ; t11a
+ REPX {pmaxsd x, m14}, m5, m1, m2, m7
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m5, m1, m2, m7
+ ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a
+ ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
+ ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
+ ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15
+ psubd m8, m0, m18 ; t2a
+ paddd m0, m18 ; out0
+ psubd m18, m23, m21 ; t3a
+ paddd m23, m21 ; -out15
+ paddd m21, m9, m5 ; -out13
+ psubd m9, m5 ; t15a
+ psubd m5, m3, m6 ; t6
+ paddd m3, m6 ; -out3
+ REPX {pmaxsd x, m14}, m8, m18, m9, m5
+ psubd m6, m20, m2 ; t14a
+ paddd m2, m20 ; out2
+ paddd m20, m16, m19 ; out12
+ psubd m16, m19 ; t7
+ REPX {pminsd x, m15}, m8, m18, m9, m5
+ psubd m19, m22, m7 ; t11
+ paddd m22, m7 ; out14
+ psubd m7, m17, m1 ; t10
+ paddd m1, m17 ; -out1
+ REPX {pmaxsd x, m14}, m6, m16, m19, m7
+ vpbroadcastd m12, [o(pd_1448)]
+ vpbroadcastd m4, [o(pd_2)]
+ vpbroadcastd m10, [o(pd_5120)]
+ vpbroadcastd m11, [o(pd_5119)]
+ REPX {pminsd x, m15}, m6, m16, m19, m7
+ psubd m17, m7, m19 ; -out9
+ paddd m7, m19 ; out6
+ psubd m19, m5, m16 ; -out11
+ paddd m5, m16 ; out4
+ REPX {pmulld x, m12}, m17, m7, m19, m5
+ psubd m16, m8, m18 ; out8
+ paddd m8, m18 ; -out7
+ psubd m18, m6, m9 ; out10
+ paddd m6, m9 ; -out5
+ REPX {pmulld x, m12}, m16, m8, m18, m6
+ REPX {paddd x, m4 }, m0, m2, m20, m22
+ REPX {psubd x, m4, x}, m1, m3, m21, m23
+ REPX {paddd x, m10 }, m7, m5, m16, m18
+ REPX {psubd x, m11, x}, m17, m19, m8, m6
+ REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
+ REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
+ ret
+ALIGN function_align
+.main_pass1_fast:
+ mova ym0, [cq+64*0]
+ mova ym1, [cq+64*2]
+ movshdup m8, [o(permB)]
+ mova ym6, [cq+64*1]
+ mova ym7, [cq+64*3]
+ mova ym2, [cq+64*4]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*5]
+ mova ym5, [cq+64*7]
+ vpermt2q m0, m8, m1 ; 0 2
+ vpermt2q m7, m8, m6 ; 3 1
+ vpermt2q m2, m8, m3 ; 4 6
+ vpermt2q m5, m8, m4 ; 7 5
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ jmp m(iadst_16x8_internal_10bpc).main_fast
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call m(iadst_16x16_internal_10bpc).main_pass1
+ packssdw m4, m19, m3
+ packssdw m3, m20, m5
+ packssdw m5, m18, m2
+ packssdw m2, m21, m6
+ packssdw m6, m17, m1
+ packssdw m1, m22, m7
+ packssdw m7, m16, m0
+ packssdw m0, m23, m8
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_16x16_internal_10bpc).main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m10, [permC]
+ movu m13, [pw_m2048_2048]
+ psrlq m11, m10, 8
+ vpermq m8, m11, m7
+ vpermq m7, m11, m6
+ vpermq m6, m11, m5
+ vpermq m5, m11, m4
+ vpermq m3, m10, m3
+ vpermq m2, m10, m2
+ vpermq m1, m10, m1
+ vpermq m0, m10, m0
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m10, [o(pd_5793)]
+ vpbroadcastd m11, [o(pd_5120)]
+ mov r6, cq
+ cmp eobd, 36
+ jl .fast
+ call .pass1_main
+ packssdw m0, m6, m8
+ packssdw m1, m7, m9
+ call .pass1_main
+ packssdw m2, m6, m8
+ packssdw m3, m7, m9
+ call .pass1_main
+ packssdw m4, m6, m8
+ packssdw m5, m7, m9
+ call .pass1_main
+ packssdw m6, m8
+ packssdw m7, m9
+ jmp m(idct_16x16_internal_10bpc).pass1_end2
+.fast:
+ call .pass1_main_fast
+ packssdw m0, m6, m7
+ call .pass1_main_fast
+ packssdw m1, m6, m7
+ call .pass1_main_fast
+ packssdw m2, m6, m7
+ call .pass1_main_fast
+ packssdw m3, m6, m7
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckldq m3, m4, m1
+ punpckhdq m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ pxor m7, m7
+ vshufi32x4 m2, m0, m3, q3131
+ vshufi32x4 m0, m3, q2020
+ vshufi32x4 m3, m1, m4, q3131
+ vshufi32x4 m1, m4, q2020
+ REPX {mova x, m7}, m4, m5, m6
+ jmp m(idct_16x16_internal_10bpc).pass1_end3
+.pass2:
+ movshdup m11, [o(permC)]
+ vpbroadcastd m12, [o(pw_1697x16)]
+ lea r6, [strideq*3]
+ vpbroadcastd m13, [o(pw_2048)]
+ pxor m14, m14
+ vpbroadcastd m15, [pixel_10bpc_max]
+ vpermq m8, m11, m0
+ vpermq m9, m11, m1
+ call .pass2_main
+ vpermq m8, m11, m2
+ vpermq m9, m11, m3
+ call .pass2_main
+ vpermq m8, m11, m4
+ vpermq m9, m11, m5
+ call .pass2_main
+ vpermq m8, m11, m6
+ vpermq m9, m11, m7
+.pass2_main:
+ pmulhrsw m0, m12, m8
+ pmulhrsw m1, m12, m9
+ paddsw m8, m8
+ paddsw m9, m9
+ paddsw m8, m0
+ paddsw m9, m1
+ pmulhrsw m8, m13
+ pmulhrsw m9, m13
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+ALIGN function_align
+.pass1_main:
+ pmulld m6, m10, [r6+64*0]
+ pmulld m7, m10, [r6+64*1]
+ pmulld m8, m10, [r6+64*8]
+ pmulld m9, m10, [r6+64*9]
+ add r6, 64*2
+ REPX {paddd x, m11}, m6, m7, m8, m9
+ REPX {psrad x, 13 }, m6, m8, m7, m9
+ ret
+ALIGN function_align
+.pass1_main_fast:
+ mova ym6, [r6+64* 0]
+ vinserti32x8 m6, [r6+64* 4], 1
+ mova ym7, [r6+64* 8]
+ vinserti32x8 m7, [r6+64*12], 1
+ add r6, 64
+ REPX {pmulld x, m10}, m6, m7
+ REPX {paddd x, m11}, m6, m7
+ REPX {psrad x, 13 }, m6, m7
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ vpbroadcastd m11, [o(pd_2)]
+ mova m20, [o(idct8x32p)]
+ pxor m21, m21
+ cmp eobd, 43
+ jl .fast
+ call .pass1_main
+ punpcklwd m16, m0, m1
+ punpcklwd m17, m2, m3
+ punpckhwd m18, m0, m1
+ punpckhwd m19, m2, m3
+ cmp eobd, 107
+ jge .full
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ call m(idct_8x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.full:
+ add cq, 64
+ call .pass1_main
+ punpcklwd m5, m0, m1
+ punpcklwd m6, m2, m3
+ punpckhwd m7, m0, m1
+ punpckhwd m8, m2, m3
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ punpckldq m4, m5, m6 ; 16 18
+ punpckhdq m5, m6 ; 20 22
+ punpckldq m6, m7, m8 ; 24 26
+ punpckhdq m7, m8 ; 28 30
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ vextracti32x8 ym18, m4, 1
+ vextracti32x8 ym19, m5, 1
+ vextracti32x8 ym20, m6, 1
+ vextracti32x8 ym21, m7, 1
+ call m(idct_8x16_internal_8bpc).main
+ REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .end
+.fast:
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*1]
+ mova ym5, [cq+128*5]
+ mova ym7, [cq+128*3]
+ mova ym3, [cq+128*7]
+ mova ym0, [cq+128*0]
+ mova ym4, [cq+128*2]
+ mova ym2, [cq+128*4]
+ mova ym6, [cq+128*6]
+ vpermt2q m1, m8, m5 ; 1 5
+ vpermt2q m3, m8, m7 ; 7 3
+ vpermt2q m0, m8, m4 ; 0 2
+ vpermt2q m2, m8, m6 ; 4 6
+ mova [cq+128*0], ym21
+ REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ packssdw m0, m2
+ packssdw m1, m3
+ vpermb m0, m20, m0
+ vprold m20, 16
+ vpermb m2, m20, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ call m(idct_8x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
+.end:
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
+ lea r3, [strideq*2]
+ vpbroadcastd m12, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m11, m11
+ lea r3, [dstq+r3*8]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ call .write_8x4x2
+ pmulhrsw m0, m10, m2
+ pmulhrsw m1, m10, m3
+ call .write_8x4x2
+ pmulhrsw m0, m10, m4
+ pmulhrsw m1, m10, m5
+ call .write_8x4x2
+ pmulhrsw m0, m10, m6
+ pmulhrsw m1, m10, m7
+.write_8x4x2:
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ vinserti32x4 m8, [dstq+strideq*2], 2
+ vinserti32x4 m8, [dstq+r6 ], 3
+ mova xm9, [r3 +r6 ]
+ vinserti32x4 ym9, [r3 +strideq*2], 1
+ vinserti32x4 m9, [r3 +strideq*1], 2
+ vinserti32x4 m9, [r3 +strideq*0], 3
+ paddw m8, m0
+ paddw m9, m1
+ pmaxsw m8, m11
+ pmaxsw m9, m11
+ pminsw m8, m12
+ pminsw m9, m12
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r6 ], m8, 3
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 [r3 +strideq*0], m9, 3
+ vextracti32x4 [r3 +strideq*1], m9, 2
+ vextracti32x4 [r3 +strideq*2], ym9, 1
+ mova [r3 +r6 ], xm9
+ lea r3, [r3+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+ALIGN function_align
+.pass1_main:
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_end2
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ REPX {vpermb x, m20, x}, m0, m1, m2, m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ pxor m10, m10
+ lea r5, [strideq*5]
+ vpbroadcastd m11, [pixel_10bpc_max]
+ sub eobd, 107
+ lea r6, [strideq+r4*2]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ lea r7, [dstq+strideq*8]
+ REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
+ REPX {paddsw x, m9}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ add cq, 64
+ mova xm4, [dstq+strideq*0]
+ mova xm5, [dstq+strideq*1]
+ mova xm6, [dstq+strideq*2]
+ mova xm7, [dstq+r4 *1]
+ punpckhwd m8, m0, m1
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ punpcklwd m0, m1
+ vinserti32x4 ym5, [dstq+r5 *1], 1
+ punpckhwd m1, m2, m3
+ vinserti32x4 ym6, [dstq+r4 *2], 1
+ punpcklwd m2, m3
+ vinserti32x4 ym7, [dstq+r6 *1], 1
+ punpckhwd m3, m0, m8
+ vinserti32x4 m4, [r7 +strideq*0], 2
+ punpcklwd m0, m8
+ vinserti32x4 m5, [r7 +strideq*1], 2
+ punpckhwd m8, m2, m1
+ vinserti32x4 m6, [r7 +strideq*2], 2
+ punpcklwd m2, m1
+ vinserti32x4 m7, [r7 +r4 *1], 2
+ punpckhqdq m1, m0, m2
+ vinserti32x4 m4, [r7 +strideq*4], 3
+ punpcklqdq m0, m2
+ vinserti32x4 m5, [r7 +r5 *1], 3
+ punpcklqdq m2, m3, m8
+ vinserti32x4 m6, [r7 +r4 *2], 3
+ punpckhqdq m3, m8
+ vinserti32x4 m7, [r7 +r6 *1], 3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ REPX {pmaxsw x, m10}, m0, m1, m2, m3
+ REPX {pminsw x, m11}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+r4 *1], xm3
+ vextracti32x4 [dstq+strideq*4], ym0, 1
+ vextracti32x4 [dstq+r5 *1], ym1, 1
+ vextracti32x4 [dstq+r4 *2], ym2, 1
+ vextracti32x4 [dstq+r6 *1], ym3, 1
+ lea dstq, [r7+strideq*8]
+ vextracti32x4 [r7 +strideq*0], m0, 2
+ vextracti32x4 [r7 +strideq*1], m1, 2
+ vextracti32x4 [r7 +strideq*2], m2, 2
+ vextracti32x4 [r7 +r4 *1], m3, 2
+ vextracti32x4 [r7 +strideq*4], m0, 3
+ vextracti32x4 [r7 +r5 *1], m1, 3
+ vextracti32x4 [r7 +r4 *2], m2, 3
+ vextracti32x4 [r7 +r6 *1], m3, 3
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m11, [o(permB)]
+ mova m0, [cq+64* 0] ; 0 1
+ mova m4, [cq+64* 1] ; 2 3
+ mova m1, [cq+64* 2] ; 4 5
+ mova m8, [cq+64* 3] ; 6 7
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m10, m11, 32
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m16, m11
+ vpermi2q m16, m0, m1 ; 1 5
+ mova m17, m11
+ vpermi2q m17, m8, m4 ; 7 3
+ cmp eobd, 43
+ jl .fast
+ mova m18, [cq+64* 4] ; 8 9
+ mova m20, [cq+64* 5] ; 10 11
+ mova m6, [cq+64* 6] ; 12 13
+ mova m7, [cq+64* 7] ; 14 15
+ vpermt2q m0, m10, m18 ; 0 8
+ vpermt2q m18, m11, m6 ; 9 13
+ mova m19, m11
+ vpermi2q m19, m7, m20 ; 15 11
+ cmp eobd, 107
+ jge .full
+ vpermt2q m1, m10, m6 ; 4 12
+ vpermt2q m4, m10, m8 ; 2 6
+ vpermt2q m7, m10, m20 ; 14 10
+ mov r6d, 64*1
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ call .main_fast
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.full:
+ mova m2, [cq+64* 8] ; 16 17
+ mova m5, [cq+64* 9] ; 18 19
+ mova m9, [cq+64*10] ; 20 21
+ mova m21, [cq+64*11] ; 22 23
+ vpermt2q m1, m10, m9 ; 4 20
+ vpermt2q m7, m10, m21 ; 14 22
+ vpermt2q m21, m11, m5 ; 23 19
+ vpermt2q m5, m10, m20 ; 18 10
+ mova m20, m11
+ vpermi2q m20, m2, m9 ; 17 21
+ mova m22, [cq+64*12] ; 24 25
+ mova m9, [cq+64*13] ; 26 27
+ mova m3, [cq+64*14] ; 28 29
+ mova m23, [cq+64*15] ; 30 31
+ vpermt2q m2, m10, m22 ; 16 24
+ vpermt2q m22, m11, m3 ; 25 29
+ vpermt2q m3, m10, m6 ; 28 12
+ vpermt2q m4, m10, m9 ; 2 26
+ mova m6, m10
+ vpermi2q m6, m23, m8 ; 30 6
+ vpermt2q m23, m11, m9 ; 31 27
+ mov r6d, 64*3
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_16x8_internal_10bpc).main
+ call .main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.fast:
+ vpermq m0, m10, m0 ; 0 0
+ vpermq m1, m10, m1 ; 4 4
+ vpermt2q m4, m10, m8 ; 2 6
+ xor r6d, r6d
+ call m(idct_8x8_internal_10bpc).main_fast2
+ call m(idct_16x8_internal_10bpc).main_fast2
+ call .main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+.end:
+ mova m10, [o(idct32x8p)]
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+ psrlw m8, m10, 8
+ mova m9, m8
+ vpermi2w m8, m1, m5
+ vpermt2w m1, m10, m5
+ vprold m5, m9, 16
+ vpermi2w m9, m3, m7
+ vpermt2w m3, m10, m7
+ vprold m10, 16
+ mova m7, m5
+ vpermi2w m5, m0, m4
+ vpermt2w m0, m10, m4
+ pxor m14, m14
+ vpermi2w m7, m2, m6
+ vpermt2w m2, m10, m6
+.zero_loop:
+ mova [cq+r6*4+64*3], m14
+ mova [cq+r6*4+64*2], m14
+ mova [cq+r6*4+64*1], m14
+ mova [cq+r6*4+64*0], m14
+ sub r6d, 64
+ jge .zero_loop
+ punpckhdq m6, m5, m8
+ punpckldq m5, m8
+ punpckhdq m8, m7, m9
+ punpckldq m7, m9
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpckhdq m3, m0, m1
+ punpckldq m0, m1
+ vpbroadcastd m13, [o(pw_2048)]
+ vpbroadcastd m15, [o(pixel_10bpc_max)]
+ lea r5, [o_base_8bpc]
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ lea r6, [strideq*3]
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ pmulhrsw m2, m13
+ pmulhrsw m3, m13
+ call .write_32x4
+ pmulhrsw m0, m13, m4
+ pmulhrsw m1, m13, m5
+ pmulhrsw m2, m13, m6
+ pmulhrsw m3, m13, m7
+.write_32x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r6 ]
+ REPX {pmaxsw x, m14}, m0, m1, m2, m3
+ REPX {pminsw x, m15}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r6 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [o(dconly_10bpc)]
+ mov [cq], eobd
+ or r3d, 8
+ add r6d, 640
+ sar r6d, 10
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m2, r6d
+ paddsw m2, m3
+.dconly_loop:
+ paddsw m0, m2, [dstq+strideq*0]
+ paddsw m1, m2, [dstq+strideq*1]
+ psubusw m0, m3
+ psubusw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m9, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16 t20
+ pmulld m16, m7 ; t31 t27
+ pmulld m22, m17 ; -t19 -t25
+ pmulld m17, m9 ; t28 t24
+ REPX {paddd x, m13}, m23, m16, m17
+ psubd m22, m13, m22
+ REPX {psrad x, 12 }, m23, m16, m22, m17
+ mova m20, m23 ; t30 t26
+ mova m9, m16 ; t17 t21
+ mova m19, m22 ; t18 t22
+ mova m18, m17 ; t29 t25
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m20, [o(pd_2751_2106)]
+ vbroadcasti32x4 m9, [o(pd_3035_3513)]
+ vbroadcasti32x4 m21, [o(pd_3703_3290)]
+ vbroadcasti32x4 m10, [o(pd_1751_2440)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16a t20a
+ pmulld m16, m7 ; t31a t27a
+ pmulld m20, m19 ; -t17a -t21a
+ pmulld m19, m9 ; t30a t26a
+ pmulld m21, m18 ; t18a t22a
+ pmulld m18, m10 ; t29a t25a
+ pmulld m22, m17 ; -t19a -t25a
+ pmulld m17, m11 ; t28a t24a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601
+ paddd m20, m13
+ paddd m22, m13
+.main2:
+ REPX {paddd x, m13}, m16, m23, m19
+ REPX {psrad x, 12 }, m16, m20, m23, m19
+ psubd m9, m16, m20 ; t17 t21
+ paddd m16, m20 ; t16 t20
+ psubd m20, m23, m19 ; t30 t26
+ paddd m23, m19 ; t31 t27
+ REPX {pmaxsd x, m14}, m9, m16, m20, m23
+ REPX {paddd x, m13}, m21, m18, m17
+ REPX {psrad x, 12 }, m18, m22, m21, m17
+ psubd m19, m22, m18 ; t18 t22
+ paddd m22, m18 ; t19 t23
+ psubd m18, m17, m21 ; t29 t25
+ paddd m17, m21 ; t28 t24
+ REPX {pmaxsd x, m14}, m19, m22, m18, m17
+ REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
+.main3:
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ psubd m7, m0, m6 ; dct16 out15 out14
+ paddd m0, m6 ; dct16 out0 out1
+ psubd m6, m1, m5 ; dct16 out12 out13
+ paddd m1, m5 ; dct16 out3 out2
+ psubd m5, m2, m4 ; dct16 out11 out10
+ paddd m2, m4 ; dct16 out4 out5
+ psubd m4, m3, m8 ; dct16 out8 out9
+ paddd m3, m8 ; dct16 out7 out6
+ ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11
+ ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
+ punpckhqdq m21, m16, m20 ; t20 t21a
+ punpcklqdq m16, m20 ; t16 t17a
+ punpcklqdq m20, m22, m19 ; t19 t18a
+ punpckhqdq m22, m19 ; t23 t22a
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpcklqdq m19, m23, m9 ; t31 t30a
+ punpckhqdq m23, m9 ; t27 t26a
+ punpckhqdq m9, m17, m18 ; t24 t25a
+ punpcklqdq m17, m18 ; t28 t29a
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ psubd m18, m16, m20 ; t19a t18
+ paddd m20, m16 ; t16a t17
+ psubd m16, m19, m17 ; t28a t29
+ paddd m19, m17 ; t31a t30
+ psubd m17, m22, m21 ; t20a t21
+ paddd m22, m21 ; t23a t22
+ psubd m21, m9, m23 ; t27a t26
+ paddd m23, m9 ; t24a t25
+ REPX {pmaxsd x, m14}, m18, m16, m17, m21
+ REPX {pminsd x, m15}, m16, m18, m21, m17
+ ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m20, m22, m19, m23
+ REPX {pminsd x, m15}, m20, m22, m19, m23
+ paddd m9, m20, m22 ; t16 t17a
+ psubd m20, m22 ; t23 t22a
+ paddd m22, m19, m23 ; t31 t30a
+ psubd m19, m23 ; t24 t25a
+ psubd m23, m16, m17 ; t20a t21
+ paddd m16, m17 ; t19a t18
+ psubd m17, m18, m21 ; t27a t26
+ paddd m21, m18 ; t28a t29
+ REPX {pmaxsd x, m14}, m20, m19, m23, m17
+ REPX {pminsd x, m15}, m19, m20, m17, m23
+ REPX {pmulld x, m12}, m19, m20, m17, m23
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ paddd m19, m13
+ paddd m17, m13
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ psubd m18, m19, m20 ; t23a t22
+ paddd m19, m20 ; t24a t25
+ paddd m20, m17, m23 ; t27 t26a
+ psubd m17, m23 ; t20 t21a
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
+ vpbroadcastd m5, [pw_4096]
+ lea r4, [strideq*3]
+ mova m6, [idtx32x8p]
+ lea r5, [strideq*5]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r6, [strideq+r4*2]
+ pxor m8, m8
+ sub eobd, 107
+ psrlw m7, m6, 8
+.loop:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1] ; 02 13
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3] ; 46 57
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5] ; 8a 9b
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7] ; ce df
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
+ mova m4, m6
+ vpermi2w m4, m1, m3
+ vpermt2w m1, m7, m3
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ mova m3, m7
+ vpermi2w m3, m0, m2
+ vpermt2w m0, m6, m2
+ add cq, 64*8
+ punpcklqdq m2, m3, m1 ; 4 5
+ punpckhqdq m3, m1 ; 6 7
+ punpckhqdq m1, m0, m4 ; 2 3
+ punpcklqdq m0, m4 ; 0 1
+ mova ym4, [dstq+strideq*0]
+ vinserti32x8 m4, [dstq+strideq*1], 1
+ paddw m0, m4
+ mova ym4, [dstq+strideq*2]
+ vinserti32x8 m4, [dstq+r4 *1], 1
+ paddw m1, m4
+ mova ym4, [dstq+strideq*4]
+ vinserti32x8 m4, [dstq+r5 *1], 1
+ paddw m2, m4
+ mova ym4, [dstq+r4 *2]
+ vinserti32x8 m4, [dstq+r6 *1], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+r4 *1], m1, 1
+ mova [dstq+strideq*4], ym2
+ vextracti32x8 [dstq+r5 *1], m2, 1
+ mova [dstq+r4 *2], ym3
+ vextracti32x8 [dstq+r6 *1], m3, 1
+ add dstq, 32
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
index 4fb30ef4e7a..3833e17c99f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
@@ -361,18 +361,32 @@ ALIGN function_align
%macro INV_TXFM_4X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
- movd m1, [o(pw_2896x8)]
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
+ mov r3d, 4
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
movd m0, r5d
- packssdw m0, m0
- pmulhrsw m0, m1
- pshuflw m0, m0, q0000
+ pshuflw m0, m0, q1111
+ pxor m3, m3
punpcklqdq m0, m0
- mova m1, m0
- TAIL_CALL m(iadst_4x4_internal_16bpc).end
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
%endif
%endmacro
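(For reference only, not part of the diff: the 8-bit dconly rounding introduced in the hunk above is arithmetically identical to the previous 12-bit form, since 2896 = 16 * 181; presumably the smaller constant is used to keep intermediate products smaller, but the result is unchanged:)

\[
\left\lfloor \frac{181\,\mathrm{dc} + 128}{2^{8}} \right\rfloor
= \left\lfloor \frac{16\,(181\,\mathrm{dc} + 128)}{2^{12}} \right\rfloor
= \left\lfloor \frac{2896\,\mathrm{dc} + 2048}{2^{12}} \right\rfloor,
\qquad
\frac{181}{2^{8}} = \frac{2896}{2^{12}} \approx \frac{1}{\sqrt{2}}.
\]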
@@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 2
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
-.end:
- imul r5d, 2896
- add r5d, 34816
- movd m0, r5d
- pshuflw m0, m0, q1111
- punpcklqdq m0, m0
- pxor m4, m4
- mova m3, [o(pixel_10bpc_max)]
- lea r2, [strideq*3]
-.loop:
- movq m1, [dstq+strideq*0]
- movq m2, [dstq+strideq*2]
- movhps m1, [dstq+strideq*1]
- movhps m2, [dstq+r2]
- paddw m1, m0
- paddw m2, m0
- REPX {pminsw x, m3}, m1, m2
- REPX {pmaxsw x, m4}, m1, m2
- movq [dstq+strideq*0], m1
- movhps [dstq+strideq*1], m1
- movq [dstq+strideq*2], m2
- movhps [dstq+r2 ], m2
- lea dstq, [dstq+strideq*4]
- dec r3d
- jg .loop
- RET
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro
@@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 4
- add r5d, 6144
- sar r5d, 13
- jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
%endif
%endmacro
@@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
imul r5d, 2896
add r5d, 34816
movd m0, r5d
@@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 2
.end:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.end2:
imul r5d, 2896
add r5d, 34816
@@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
mov r3d, 4
%if stack_size_padded > 0
; adjust to caller's stack allocation
@@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
pcmpeqd m8, m8
REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@@ -2785,6 +2774,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
pcmpeqd m0, m0
REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
mova [r3+ 1*16], m1
@@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
%if ARCH_X86_32
add rsp, 1*16
%endif
@@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (5+ARCH_X86_64*3+WIN64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
@@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
psrld m8, m11, 10 ; 2
REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@@ -4087,6 +4086,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
mova m0, [o(pd_2)]
REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
paddd m0, [r3+ 0*16]
@@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (31+2*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
@@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
%endif
RET
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
add rsp, (65+4*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
@@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
; final sumsub for idct16 as well as idct32, plus final downshift
%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
mova m%4, [r3+16*(23-%1)]
+ pmaxsd m%1, m12
+ pminsd m%1, m13
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12
@@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
.loop_dct32_end:
mova m0, [r3+16*16]
mova m6, [r3+16*24]
+ pmaxsd m0, m2
+ pminsd m0, m3
psubd m5, m0, m6 ; idct16 out15 - n
paddd m0, m6 ; idct16 out0 + n
pmaxsd m0, m2
@@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
@@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
@@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
@@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
@@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova m5, [r3-16* 4] ; idct64 48 + n
mova m6, [r4-16*20] ; idct64 47 - n
mova m7, [r3-16*20] ; idct64 32 + n
+ pmaxsd m0, m12
+ pminsd m0, m13
paddd m8, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m12}, m8, m0
@@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova [r4-16* 4], m6
mova [r3+16*12], m8
%else
+ mova m5, [o(clip_18b_min)]
+ mova m6, [o(clip_18b_max)]
mova m1, [r3+16*44] ; idct16 15 - n
+ pmaxsd m0, m5
+ pminsd m0, m6
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
- mova m5, [o(clip_18b_min)]
- mova m6, [o(clip_18b_max)]
REPX {pmaxsd x, m5}, m4, m0
REPX {pminsd x, m6}, m4, m0
paddd m1, m4, m3 ; idct32 out0 + n
@@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (1+8*32+1*WIN64)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
@@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm
index 092c842786d..a67f053a61b 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm
@@ -126,7 +126,7 @@ pw_m2751_3035x8: dw -2751*8, 3035*8
SECTION .text
-; Code size reduction trickery: Intead of using rip-relative loads with
+; Code size reduction trickery: Instead of using rip-relative loads with
; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
; single rip-relative lea and then address things relative from that with
; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
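(Aside, not part of the diff: a minimal standalone NASM sketch of the base-pointer addressing idea described in the comment above. The labels and the o() macro here are hypothetical stand-ins, not dav1d's actual definitions.)

        default rel
SECTION .rodata
base_tbl: times 4 dw 2048         ; constants kept within +-128 bytes of base_tbl
pw_2896:  times 4 dw 2896

SECTION .text
%define o(x) r5 + (x) - base_tbl  ; 1-byte displacement while |x - base_tbl| < 128

example_fn:
        lea   r5, [base_tbl]      ; one rip-relative lea with a 4-byte offset
        movq  xmm0, [o(base_tbl)] ; subsequent loads use short displacements
        movq  xmm1, [o(pw_2896)]
        ret

(Data outside that +-128-byte window still has to be loaded with plain rip-relative addressing, which is why a few loads later in this diff are annotated "intentionally rip-relative".)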
@@ -1194,13 +1194,9 @@ cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
+ mov [cq], eobd
pmulhrsw xm0, xm1
- movd xm2, [o(pw_2048)]
- pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- vpbroadcastw m0, xm0
- mova m1, m0
- jmp m(iadst_8x4_internal_8bpc).end3
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
%endif
%endmacro
@@ -1340,20 +1336,20 @@ cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
+ or r3d, 8
+.dconly:
pmulhrsw xm0, xm2
- psrlw xm2, 3 ; pw_2048
+.dconly2:
+ movd xm2, [pw_2048]
pmulhrsw xm0, xm1
+ lea r2, [strideq*3]
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
-.end:
- mov r2d, 2
-.end2:
- lea r3, [strideq*3]
-.loop:
- WRITE_8X4 0, 0, 1, 2
+.dconly_loop:
+ WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2
lea dstq, [dstq+strideq*4]
- dec r2d
- jg .loop
+ sub r3d, 4
+ jg .dconly_loop
RET
%endif
%endmacro
@@ -1543,13 +1539,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- psrlw xm2, 3 ; pw_2048
- pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- vpbroadcastw m0, xm0
- mov r2d, 4
- jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro
@@ -1902,7 +1893,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
- mov r2d, 2
+ or r3d, 4
.dconly:
pmulhrsw xm0, xm2
movd xm2, [pw_2048] ; intentionally rip-relative
@@ -1911,17 +1902,17 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastw m0, xm0
pxor m3, m3
.dconly_loop:
- mova xm1, [dstq]
- vinserti128 m1, [dstq+strideq], 1
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
punpckhbw m2, m1, m3
punpcklbw m1, m3
paddw m2, m0
paddw m1, m0
packuswb m1, m2
- mova [dstq], xm1
- vextracti128 [dstq+strideq], m1, 1
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
lea dstq, [dstq+strideq*2]
- dec r2d
+ sub r3d, 2
jg .dconly_loop
RET
%endif
@@ -2162,7 +2153,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 4
+ or r3d, 8
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
%endif
%endmacro
@@ -2473,7 +2464,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 8
+ or r3d, 16
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
%endif
%endmacro
@@ -3120,13 +3111,8 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- pmulhrsw xm0, xm2
- psrlw xm2, 2 ; pw_2048
- pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- vpbroadcastw m0, xm0
- mov r2d, 8
- jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
.full:
REPX {pmulhrsw x, m9}, m12, m13, m14, m15
pmulhrsw m6, m9, [rsp+32*2]
@@ -3290,7 +3276,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 8
+ or r3d, 8
.dconly:
pmulhrsw xm0, xm2
movd xm2, [pw_2048] ; intentionally rip-relative
@@ -3307,7 +3293,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
packuswb m1, m2
mova [dstq], m1
add dstq, strideq
- dec r2d
+ dec r3d
jg .dconly_loop
RET
.normal:
@@ -3672,7 +3658,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 16
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.full:
mova [tmp1q-32*4], m1
@@ -3991,7 +3977,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 16
+ or r3d, 16
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
.normal:
PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
@@ -4222,7 +4208,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
.normal:
PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
@@ -4486,7 +4472,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 32
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.normal:
PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
@@ -4832,7 +4818,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 16
+ or r3d, 16
.dconly:
pmulhrsw xm0, xm2
movd xm2, [o(pw_2048)]
@@ -4856,7 +4842,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
mova [dstq+32*0], m2
mova [dstq+32*1], m3
add dstq, strideq
- dec r2d
+ dec r3d
jg .dconly_loop
RET
.normal:
@@ -4997,7 +4983,7 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
.normal:
PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
@@ -5200,7 +5186,7 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
.normal:
PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
@@ -5381,7 +5367,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
.normal:
PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm
index 7d01bccb4f5..f30f4909287 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm
@@ -29,7 +29,8 @@
%if ARCH_X86_64
SECTION_RODATA 64
-int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+const \
+int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
@@ -84,7 +85,7 @@ pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11
gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13
gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10
-gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10
+gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16
int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
@@ -845,7 +846,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
punpcklwd m3, m5 ; dct8 in3 in5
punpckhwd m5, m2 ; dct16 in11 in5
punpcklwd m6, m2 ; dct4 in3 in1
-.main2:
+cglobal_label .main2
vpbroadcastd m10, [o(pd_2048)]
.main3:
vpbroadcastq m13, [o(int_mshift)]
@@ -1355,7 +1356,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpermq m3, m3, q2031
jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
-.main:
+cglobal_label .main
IDCT8_1D_PACKED
ret
@@ -1422,7 +1423,7 @@ ALIGN function_align
punpckhqdq m0, m4 ; out0 -out1
ret
ALIGN function_align
-.main_pass2:
+cglobal_label .main_pass2
IADST8_1D_PACKED 2
ret
@@ -1499,8 +1500,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 16
imul r6d, 181
- mov r3d, 16
add r6d, 128
sar r6d, 8
jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
@@ -1608,7 +1609,54 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpscatterdq [r3+ym8]{k2}, m2
RET
ALIGN function_align
-.main:
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ vpbroadcastd ym3, [o(pw_401_4076x8)]
+ vpbroadcastd ym5, [o(pw_799_4017x8)]
+ vpbroadcastd ym4, [o(pw_m1189_3920x8)]
+ pxor ym6, ym6
+ punpckhwd ym2, ym0, ym0
+ pmulhrsw ym2, ym3 ; t8a t15a
+ punpcklwd ym7, ym1, ym1
+ pmulhrsw ym7, ym5 ; t4a t7a
+ punpckhwd ym1, ym1
+ pmulhrsw ym4, ym1 ; t11a t12a
+ vpcmpub k7, ym13, ym10, 6
+ punpcklwd ym9, ym6, ym0
+ psubsw ym0, ym2, ym4 ; t11a t12a
+ paddsw ym8, ym2, ym4 ; t8a t15a
+ mova ym1, ym7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ pxor ym6, ym6
+ punpckhwd ym8, ym0, ym0
+ punpckhwd ym4, ym3, ym3
+ punpckhwd ym5, ym2, ym2
+ punpcklwd ym7, ym1, ym1
+ punpckhwd ym1, ym1
+ punpcklwd ym3, ym3
+ punpcklwd ym9, ym6, ym0
+ punpcklwd ym6, ym2
+ vpbroadcastd ym2, [o(pw_401_4076x8)]
+ vpbroadcastd ym0, [o(pw_m2598_3166x8)]
+ vpbroadcastd ym11, [o(pw_1931_3612x8)]
+ vpbroadcastd ym12, [o(pw_m1189_3920x8)]
+ pmulhrsw ym8, ym2 ; t8a t15a
+ vpbroadcastd ym2, [o(pw_799_4017x8)]
+ pmulhrsw ym0, ym4 ; t9a t14a
+ vpbroadcastd ym4, [o(pw_m2276_3406x8)]
+ pmulhrsw ym5, ym11 ; t10a t13a
+ pmulhrsw ym1, ym12 ; t11a t12a
+ pmulhrsw ym7, ym2 ; t4a t7a
+ pmulhrsw ym3, ym4 ; t5a t6a
+ vpcmpub k7, ym13, ym10, 6
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
WRAP_YMM IDCT16_1D_PACKED
ret
@@ -1685,13 +1733,14 @@ ALIGN function_align
vpermi2q m6, m0, m2 ; in4 in8 in6 in10
vpermt2q m1, m10, m3 ; in11 in7 in9 in5
.main:
- vpbroadcastd m9, [o(pd_2048)]
- vpbroadcastq m13, [o(int_mshift)]
- kxnorb k1, k1, k1
punpcklwd m0, m4, m5 ; in0 in15 in2 in13
punpckhwd m4, m5 ; in12 in3 in14 in1
punpcklwd m5, m6, m1 ; in4 in11 in6 in9
punpckhwd m6, m1 ; in8 in7 in10 in5
+cglobal_label .main2
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ kxnorb k1, k1, k1
vpcmpub k7, m13, m9, 6 ; 0x33...
pxor m8, m8
ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
@@ -1976,7 +2025,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 8
+ or r3d, 8
.dconly:
imul r6d, 181
add r6d, 128
@@ -2114,7 +2163,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vextracti32x4 [r3 +r4 ], m1, 3
RET
ALIGN function_align
-.main:
+cglobal_label .main
IDCT8_1D_PACKED
ret
@@ -2168,6 +2217,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call .main_pass2
+ movshdup m4, [o(permC)]
pmulhrsw m0, m6
pmulhrsw m1, m6
psrlq m6, m4, 4
@@ -2194,9 +2244,8 @@ ALIGN function_align
IADST8_1D_PACKED 1
ret
ALIGN function_align
-.main_pass2:
+cglobal_label .main_pass2
IADST8_1D_PACKED 2
- movshdup m4, [o(permC)]
pxor m5, m5
psubd m5, m6
packssdw m6, m5
@@ -2222,6 +2271,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m4, [o(permC)]
pmulhrsw m5, m6, m0
pmulhrsw m0, m6, m1
psrlq m1, m4, 12
@@ -2276,8 +2326,8 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 16
imul r6d, 181
- mov r3d, 16
add r6d, 128+512
sar r6d, 8+2
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
@@ -2456,7 +2506,7 @@ ALIGN function_align
pmulhrsw m3, m4 ; t5a t6a
jmp .main4
ALIGN function_align
-.main:
+cglobal_label .main
IDCT16_1D_PACKED
ret
@@ -2562,6 +2612,7 @@ ALIGN function_align
vshufi32x4 m1, m5, q2020 ; 2 3
vshufi32x4 m5, m7, m9, q2020 ; 10 11
vshufi32x4 m7, m9, q3131 ; 14 15
+cglobal_label .main_pass2b
REPX {pshufd x, x, q1032}, m1, m3, m5, m7
call .main
vpbroadcastd m8, [o(pw_2896x8)]
@@ -2770,13 +2821,13 @@ ALIGN function_align
vpermt2q m9, m12, m7
jmp m(idct_16x16_internal_8bpc).end
-%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
- vpbroadcastd m%3, [o(pw_%4_%5x8)]
- punpcklwd m%1, m%2, m%2
- pmulhrsw m%1, m%3
- vpbroadcastd m%3, [o(pw_%6_%7x8)]
- punpckhwd m%2, m%2
- pmulhrsw m%2, m%3
+%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
+ vpbroadcastd m%4, [o(pw_%5_%6x8)]
+ punpcklwd m%1, m%3, m%3
+ pmulhrsw m%1, m%4
+ vpbroadcastd m%4, [o(pw_%7_%8x8)]
+ punpckhwd m%2, m%3, m%3
+ pmulhrsw m%2, m%4
%endmacro
cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
@@ -2864,82 +2915,86 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6
vinserti32x4 ym14, ym16, xm17, 1 ; 1 3
vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7
- pxor ym4, ym4
vpermt2q m2, m5, m6 ; 8 10
vpermt2q m16, m5, m17 ; 9 11
- mova ym5, ym4
- mova ym6, ym4
- mova ym7, ym4
vextracti32x8 ym3, m2, 1 ; 12 14
vextracti32x8 ym17, m16, 1 ; 13 15
- call m(idct_8x16_internal_8bpc).main
+ call m(idct_8x16_internal_8bpc).main_fast
call .main_fast
.end:
- vpbroadcastd ym12, strided
- vpbroadcastd m13, [o(pw_2048)]
- pmulld ym7, ym12, [o(gather8d)]
- REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11
+ vpbroadcastd ym8, strided
+ pmulld ym8, [o(gather8d)]
+ call .main_end
lea r3, [dstq+strideq*4]
- shl strideq, 4
- lea r4, [dstq+strideq]
- add r1, r3
kxnorb k1, k1, k1
- pxor m6, m6
+ lea r4, [dstq+strideq*8]
+ pxor m9, m9
+ lea r1, [r3+strideq*8]
kmovb k2, k1
- vpgatherdq m12{k1}, [r0+ym7]
+ vpgatherdq m12{k1}, [r0+ym8]
kmovb k1, k2
- vpgatherdq m13{k2}, [r3+ym7]
+ vpgatherdq m13{k2}, [r3+ym8]
kmovb k2, k1
- vpgatherdq m14{k1}, [r4+ym7]
+ vpgatherdq m14{k1}, [r4+ym8]
kmovb k1, k2
- vpgatherdq m15{k2}, [r1+ym7]
- REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
- punpcklbw m4, m12, m6
- punpckhbw m12, m6
- paddw m0, m4
+ vpgatherdq m15{k2}, [r1+ym8]
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m11, m12, m9
+ punpckhbw m12, m9
+ paddw m0, m11
paddw m1, m12
packuswb m0, m1
kmovb k2, k1
- vpscatterdq [r0+ym7]{k1}, m0
- punpcklbw m4, m13, m6
- punpckhbw m13, m6
- paddw m2, m4
+ vpscatterdq [r0+ym8]{k1}, m0
+ punpcklbw m12, m13, m9
+ punpckhbw m13, m9
+ paddw m2, m12
paddw m3, m13
packuswb m2, m3
kmovb k1, k2
- vpscatterdq [r3+ym7]{k2}, m2
- punpcklbw m4, m14, m6
- punpckhbw m14, m6
- paddw m8, m4
- paddw m9, m14
- packuswb m8, m9
+ vpscatterdq [r3+ym8]{k2}, m2
+ punpcklbw m13, m14, m9
+ punpckhbw m14, m9
+ paddw m4, m13
+ paddw m5, m14
+ packuswb m4, m5
kmovb k2, k1
- vpscatterdq [r4+ym7]{k1}, m8
- punpcklbw m4, m15, m6
- punpckhbw m15, m6
- paddw m10, m4
- paddw m11, m15
- packuswb m10, m11
- vpscatterdq [r1+ym7]{k2}, m10
+ vpscatterdq [r4+ym8]{k1}, m4
+ punpcklbw m14, m15, m9
+ punpckhbw m15, m9
+ paddw m6, m14
+ paddw m7, m15
+ packuswb m6, m7
+ vpscatterdq [r1+ym8]{k2}, m6
RET
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 32
+ or r3d, 32
imul r6d, 181
add r6d, 128+512
sar r6d, 8+2
jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
INIT_YMM avx512icl
ALIGN function_align
-.main_fast: ; bottom half is zero
- ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
- ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
- ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
- ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ mova m11, m12
+ mova m17, m20
+ mova m15, m21
+ mova m16, m14
+ jmp .main4
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
jmp .main3
ALIGN function_align
-.main:
+cglobal_label .main
punpcklwd m12, m21, m14 ; in31 in1
punpckhwd m14, m21 ; in3 in29
punpcklwd m21, m20, m15 ; in27 in5
@@ -2966,6 +3021,7 @@ ALIGN function_align
paddsw m21, m16 ; t20 t27
psubsw m16, m14, m19 ; t22 t25
paddsw m14, m19 ; t23 t24
+.main4:
ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a
ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a
ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a
@@ -2997,8 +3053,8 @@ ALIGN function_align
REPX {pshufb x, m18}, m20, m11, m21, m19
ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a
ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25
- packssdw m18, m13 ; t23a t22
- packssdw m12, m15 ; t24a t25
+ packssdw m18, m13 ; t23a t22
+ packssdw m12, m15 ; t24a t25
ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a
ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27
packssdw m16, m13 ; t20 t21a
@@ -3007,32 +3063,27 @@ ALIGN function_align
punpckhqdq m19, m21 ; t28a t29
punpcklqdq m21, m20, m11 ; t16 t17a
punpckhqdq m20, m11 ; t31 t30a
- psubsw m15, m1, m19 ; out28 out29
- paddsw m1, m19 ; out3 out2
- psubsw m9, m6, m13 ; out19 out18
- paddsw m6, m13 ; out12 out13
- psubsw m10, m5, m16 ; out20 out21
- paddsw m5, m16 ; out11 out10
- psubsw m19, m3, m12 ; out24 out25
- paddsw m3, m12 ; out7 out6
- psubsw m8, m7, m21 ; out16 out17
- paddsw m7, m21 ; out15 out14
- psubsw m21, m0, m20 ; out31 out30
- paddsw m0, m20 ; out0 out1
- psubsw m11, m4, m18 ; out23 out22
- paddsw m4, m18 ; out8 out9
- psubsw m18, m2, m14 ; out27 out26
- paddsw m2, m14 ; out4 out5
INIT_ZMM avx512icl
- movu m16, [o(permD+3)]
- vpermt2q m0, m16, m4 ; 0 1 8 9
- vpermt2q m8, m16, m19 ; 16 17 24 25
- vpermt2q m1, m16, m5 ; 3 2 11 10
- vpermt2q m9, m16, m18 ; 19 18 27 26
- vpermt2q m2, m16, m6 ; 4 5 12 13
- vpermt2q m10, m16, m15 ; 20 21 28 29
- vpermt2q m3, m16, m7 ; 7 6 15 14
- vpermt2q m11, m16, m21 ; 23 22 31 30
+ mova m15, [o(permA)]
+ ret
+cglobal_label .main_end
+ vpbroadcastd m10, [o(pw_2048)]
+ vpermt2q m0, m15, m1 ; t0 t1 t2 t3
+ vpermt2q m20, m15, m19 ; t31 t30a t29 t28a
+ vpermt2q m2, m15, m3 ; t4 t5 t6 t7
+ vpermt2q m14, m15, m12 ; t27 t26a t25 t24a
+ vpermt2q m4, m15, m5 ; t8 t9 t10 t11
+ vpermt2q m18, m15, m16 ; t23a t22 t21a t20
+ vpermt2q m6, m15, m7 ; t12 t13 t14 t15
+ vpermt2q m13, m15, m21 ; t19a t18 t17a t16
+ psubsw m7, m0, m20 ; out31 out30 out29 out28
+ paddsw m0, m20 ; out0 out1 out2 out3
+ psubsw m5, m2, m14 ; out27 out26 out25 out24
+ paddsw m2, m14 ; out4 out5 out6 out7
+ psubsw m3, m4, m18 ; out23 out22 out21 out20
+ paddsw m4, m18 ; out8 out9 out10 out11
+ psubsw m1, m6, m13 ; out19 out18 out17 out16
+ paddsw m6, m13 ; out12 out13 out14 out15
vzeroupper
ret
@@ -3079,16 +3130,33 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
call m(idct_8x16_internal_8bpc).main
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
.pass2:
- vpbroadcastd m12, [o(pw_8192)]
- vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31
- vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30
- vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29
- vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28
- vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27
- vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26
- vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15
- vshufi32x4 m0, m8, q2020 ; 0 8 16 24
- REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m10, [o(pw_8192)]
+ vpermt2q m0, m15, m4 ; t0 t1 t9 t8
+ vpermt2q m20, m15, m18 ; t31 t30a t23a t22
+ vpermt2q m3, m15, m7 ; t7 t6 t14 t15
+ vpermt2q m12, m15, m21 ; t25 t24a t17a t16
+ vpermt2q m2, m15, m6 ; t4 t5 t13 t12
+ vpermt2q m14, m15, m13 ; t23a t22 t21a t20
+ vpermt2q m1, m15, m5 ; t3 t2 t10 t11
+ vpermt2q m19, m15, m16 ; t27 t26a t19a t18
+ psubsw m8, m0, m20 ; out31 out30 out22 out23
+ paddsw m0, m20 ; out0 out1 out9 out8
+ paddsw m6, m3, m12 ; out7 out6 out14 out15
+ psubsw m3, m12 ; out24 out25 out17 out16
+ psubsw m5, m2, m14 ; out27 out26 out18 out19
+ paddsw m4, m2, m14 ; out4 out5 out13 out12
+ psubsw m7, m1, m19 ; out28 out29 out21 out20
+ paddsw m2, m1, m19 ; out3 out2 out10 out11
+ vzeroupper
+ vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25
+ vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24
+ vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27
+ vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26
+ vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29
+ vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28
+ vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31
+ vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
call .main
vpbroadcastd m8, [o(pw_2048)]
@@ -3132,7 +3200,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 8
+ or r3d, 8
.dconly2:
imul r6d, 181
add r6d, 128+512
@@ -3158,7 +3226,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
jg .dconly_loop
RET
ALIGN function_align
-.main:
+cglobal_label .main
vpbroadcastd m10, [o(pd_2048)]
.main2:
ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
@@ -3535,7 +3603,7 @@ ALIGN function_align
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
ALIGN function_align
.main_oddhalf_fast2: ; bottom three-quarters are zero
@@ -3821,8 +3889,8 @@ ALIGN function_align
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 16
imul r6d, 181
- mov r3d, 16
add r6d, 128
sar r6d, 8
imul r6d, 181
@@ -4603,7 +4671,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
ALIGN function_align
.main_oddhalf_fast2: ; bottom three-quarters are zero
@@ -5068,8 +5136,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 64
imul r6d, 181
- mov r3d, 64
add r6d, 128+512
sar r6d, 8+2
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
@@ -5282,7 +5350,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
jnz .normal
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 16
+ or r3d, 16
.dconly:
imul r6d, 181
add r6d, 128+512
@@ -6012,8 +6080,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 64
imul r6d, 181
- mov r3d, 64
add r6d, 128
sar r6d, 8
imul r6d, 181
@@ -6674,8 +6742,8 @@ ALIGN function_align
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 32
imul r6d, 181
- mov r3d, 32
add r6d, 128
sar r6d, 8
imul r6d, 181
@@ -7117,7 +7185,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
ALIGN function_align
.pass2_end:
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h
new file mode 100644
index 00000000000..33c842a9ce4
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+#define decl_loopfilter_sb_fns(ext) \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext))
+
+decl_loopfilter_sb_fns(ssse3);
+decl_loopfilter_sb_fns(avx2);
+decl_loopfilter_sb_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
+#endif
+}
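
The header above only provides a static ALWAYS_INLINE init routine; it is meant to be #included into the bit-depth-templated loop filter code and called from the generic DSP init, rather than being built as a separate *_init.c translation unit. Below is a minimal call-site sketch under that assumption; the template macros (bitfn, COLD, HAVE_ASM, ARCH_X86) follow the usual dav1d conventions, and the C fallback function names are placeholders, not code taken from this patch.

/* Hypothetical call-site sketch (illustrative only, not part of the patch). */
#include "src/loopfilter.h"

#if HAVE_ASM && ARCH_X86
#include "src/x86/loopfilter.h"
#endif

COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
    /* Plain C fallbacks first; these names are placeholders. */
    c->loop_filter_sb[0][0] = loop_filter_h_sb_y_c;
    c->loop_filter_sb[0][1] = loop_filter_v_sb_y_c;
    c->loop_filter_sb[1][0] = loop_filter_h_sb_uv_c;
    c->loop_filter_sb[1][1] = loop_filter_v_sb_uv_c;

#if HAVE_ASM && ARCH_X86
    /* Overrides the entries above with SIMD versions when CPU flags allow. */
    loop_filter_dsp_init_x86(c);
#endif
}
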
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm
index 361ccc3b883..ed83000ac24 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm
@@ -30,22 +30,24 @@
SECTION_RODATA 32
+pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
times 4 db 8, 9
times 4 db 0, 1
times 4 db 8, 9
-pw_1: times 16 dw 1
-pw_2: times 16 dw 2
-pw_3: times 16 dw 3
-; 4 and 16 need to be next to each other since they are used as alternates
-; depending on whether bitdepth is 10 or 12
-pw_4: times 16 dw 4
-pw_16: times 16 dw 16
-pw_8: times 16 dw 8
-pw_4096: times 16 dw 4096
+pw_1: times 16 dw 1
+pw_2: times 16 dw 2
+pw_3: times 16 dw 3
+pw_4096: times 2 dw 4096
-pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
+; 10bpc/12bpc:
+pw_4: times 2 dw 4
+ times 2 dw 16
+clip_max: times 2 dw 511
+ times 2 dw 2047
+clip_min: times 2 dw -512
+ times 2 dw -2048
SECTION .text
@@ -398,9 +400,10 @@ SECTION .text
pmaxuw m2, [pw_1] ; I
psrlw m1, m0, 4 ; H
paddw m0, [pw_2]
+ vpbroadcastd m8, [r11]
paddw m0, m0
paddw m0, m2 ; E
- REPX {pmullw x, [r11]}, m0, m1, m2
+ REPX {pmullw x, m8}, m0, m1, m2
psubw m8, m3, m4 ; p1-p0
psubw m9, m5, m6 ; q1-q0
@@ -430,7 +433,8 @@ SECTION .text
pabsw m10, m10
pmaxuw m9, m10
%endif
- pcmpgtw m9, [r11] ; !flat8in
+ vpbroadcastd m10, [r11]
+ pcmpgtw m9, m10 ; !flat8in
psubw m10, m13, m3 ; p2-p1
pabsw m10, m10
@@ -503,7 +507,8 @@ SECTION .text
pmaxuw m0, m2
pmaxuw m1, m10
pmaxuw m1, m0
- pcmpgtw m1, [r11] ; !flat8out
+ vpbroadcastd m0, [r11]
+ pcmpgtw m1, m0 ; !flat8out
por m1, m9 ; !flat8in | !flat8out
vpbroadcastd m2, [maskq+8]
pand m10, m2, m12
@@ -544,12 +549,8 @@ SECTION .text
%endif
; short filter
-
- vpbroadcastw m0, r7m
- pcmpeqw m2, m2
- psrlw m0, 1 ; 511 or 2047
- pxor m2, m0 ; -512 or -2048
-
+ vpbroadcastd m0, [r11+8*1] ; 511 or 2047
+ vpbroadcastd m2, [r11+8*2] ; -512 or -2048
psubw m10, m5, m4
paddw m11, m10, m10
paddw m11, m10
@@ -561,17 +562,18 @@ SECTION .text
pminsw m10, m0
pmaxsw m10, m2
pand m8, m10 ; f&=fm
- paddw m10, m8, [pw_3]
- paddw m8, [pw_4]
+ vpbroadcastd m10, [pw_4]
+ paddw m10, m8
+ paddw m8, [pw_3]
REPX {pminsw x, m0}, m10, m8
psraw m10, 3 ; f2
psraw m8, 3 ; f1
- paddw m4, m10
- psubw m5, m8
+ psubw m5, m10
+ paddw m4, m8
- paddw m8, [pw_1]
- psraw m8, 1 ; f=(f1+1)>>1
- pandn m8, m7, m8 ; f&=!hev
+ paddw m10, [pw_1]
+ psraw m10, 1 ; f=(f1+1)>>1
+ pandn m8, m7, m10 ; f&=!hev
paddw m3, m8
psubw m6, m8
pxor m8, m8
@@ -603,8 +605,8 @@ SECTION .text
mova [rsp+ 0*32], m9
; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
- psllw m8, m0, 3 ; p6*8
- paddw m8, [pw_8]
+ paddw m8, m0, [pw_1]
+ psllw m8, 3 ; p6*8+8
paddw m10, m2, m7 ; p5+p4
psubw m8, m0
paddw m10, m10 ; (p5+p4)*2
@@ -759,7 +761,6 @@ SECTION .text
psubw m8, m15
paddw m8, m0
psrlw m10, m8, 4
- pand m10, m1
%ifidn %2, v
mova m9, [tmpq+strideq*1]
%else
@@ -788,6 +789,7 @@ SECTION .text
%if %1 >= 8
; flat8 filter
+ vpbroadcastd m7, [pw_4096]
%ifidn %2, v
mova m0, [tmpq+strideq*0] ; p3
%else
@@ -799,43 +801,43 @@ SECTION .text
paddw m2, m0 ; p1+p0+p3
paddw m8, m5 ; 2*(p3+p2)+q0
paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0
- pmulhrsw m7, m2, [pw_4096]
+ pmulhrsw m10, m2, m7
paddw m8, m3, m6
psubw m2, m1
paddw m2, m8
- pmulhrsw m8, m2, [pw_4096]
+ pmulhrsw m8, m2, m7
- paddw m10, m0, m3
- paddw m11, m4, m14
- psubw m2, m10
- paddw m2, m11
- pmulhrsw m10, m2, [pw_4096]
+ paddw m11, m0, m3
+ paddw m1, m4, m14
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m1, m2, m7
paddw m11, m0, m4
+ pblendvb m4, m1, m9
paddw m1, m5, m15
psubw m2, m11
paddw m2, m1
- pmulhrsw m11, m2, [pw_4096]
+ pmulhrsw m11, m2, m7
paddw m2, m6
paddw m2, m15
paddw m1, m13, m5
+ pblendvb m5, m11, m9
+ pblendvb m13, m10, m9
psubw m2, m1
- pmulhrsw m1, m2, [pw_4096]
+ pmulhrsw m1, m2, m7
psubw m2, m3
+ pblendvb m3, m8, m9
psubw m2, m6
- paddw m0, m15, m14
- paddw m2, m0
- pmulhrsw m2, [pw_4096]
+ pblendvb m6, m1, m9
+ paddw m1, m15, m14
+ paddw m2, m1
+ pmulhrsw m2, m7
- vpblendvb m13, m13, m7, m9
- vpblendvb m3, m3, m8, m9
- vpblendvb m4, m4, m10, m9
- vpblendvb m5, m5, m11, m9
- vpblendvb m6, m6, m1, m9
- vpblendvb m14, m14, m2, m9
+ pblendvb m14, m2, m9
%ifidn %2, v
mova [tmpq+strideq*1], m13 ; p2
@@ -844,9 +846,7 @@ SECTION .text
mova [dstq+strideq*0], m5 ; q0
mova [dstq+strideq*1], m6 ; q1
mova [dstq+strideq*2], m14 ; q2
-%else
- mova m0, [rsp+5*32]
-%if %1 == 8
+%elif %1 == 8
TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1
; write 8x16
@@ -871,29 +871,28 @@ SECTION .text
vextracti128 [dstq+stride3q -8], m15, 1
lea dstq, [dstq+strideq*4]
%else
- mova m0, [rsp+6*32]
+ mova m8, [rsp+6*32]
mova m1, [rsp+7*32]
mova m2, [rsp+8*32]
mova m7, [rsp+9*32]
- mova m8, [rsp+5*32]
- TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9
+ TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9
- mova [dstq+strideq*0-16], xm0
+ mova [dstq+strideq*0-16], xm8
mova [dstq+strideq*1-16], xm1
mova [dstq+strideq*2-16], xm2
mova [dstq+stride3q -16], xm7
lea tmpq, [dstq+strideq*4]
- mova [tmpq+strideq*0-16], xm8
+ mova [tmpq+strideq*0-16], xm0
mova [tmpq+strideq*1-16], xm13
mova [tmpq+strideq*2-16], xm3
mova [tmpq+stride3q -16], xm4
lea tmpq, [tmpq+strideq*4]
- vextracti128 [tmpq+strideq*0-16], m0, 1
+ vextracti128 [tmpq+strideq*0-16], m8, 1
vextracti128 [tmpq+strideq*1-16], m1, 1
vextracti128 [tmpq+strideq*2-16], m2, 1
vextracti128 [tmpq+stride3q -16], m7, 1
lea tmpq, [tmpq+strideq*4]
- vextracti128 [tmpq+strideq*0-16], m8, 1
+ vextracti128 [tmpq+strideq*0-16], m0, 1
vextracti128 [tmpq+strideq*1-16], m13, 1
vextracti128 [tmpq+strideq*2-16], m3, 1
vextracti128 [tmpq+stride3q -16], m4, 1
@@ -924,39 +923,38 @@ SECTION .text
vextracti128 [dstq+stride3q ], m3, 1
lea dstq, [dstq+strideq*4]
%endif
-%endif
%elif %1 == 6
; flat6 filter
-
+ vpbroadcastd m7, [pw_4096]
paddw m8, m3, m4
paddw m8, m13 ; p2+p1+p0
paddw m11, m13, m5
paddw m8, m8
paddw m8, m11 ; p2+2*(p2+p1+p0)+q0
- pmulhrsw m2, m8, [pw_4096]
+ pmulhrsw m2, m8, m7
paddw m8, m5
paddw m11, m13, m13
paddw m8, m6
psubw m8, m11
- pmulhrsw m10, m8, [pw_4096]
+ pmulhrsw m10, m8, m7
paddw m8, m6
paddw m11, m13, m3
paddw m8, m14
psubw m8, m11
- pmulhrsw m11, m8, [pw_4096]
+ pmulhrsw m11, m8, m7
psubw m8, m3
paddw m14, m14
psubw m8, m4
paddw m8, m14
- pmulhrsw m8, [pw_4096]
+ pmulhrsw m8, m7
- vpblendvb m3, m3, m2, m9
- vpblendvb m4, m4, m10, m9
- vpblendvb m5, m5, m11, m9
- vpblendvb m6, m6, m8, m9
+ pblendvb m3, m2, m9
+ pblendvb m4, m10, m9
+ pblendvb m5, m11, m9
+ pblendvb m6, m8, m9
%ifidn %2, v
mova [tmpq+strideq*2], m3 ; p1
@@ -982,10 +980,10 @@ INIT_YMM avx2
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov wd, wm
shl l_strideq, 2
sub lq, l_strideq
@@ -1013,7 +1011,7 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .end
- FILTER 4, v
+ call .v4
.end:
pslld m12, 4
@@ -1023,15 +1021,19 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
sub wd, 4
jg .loop
RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
INIT_YMM avx2
cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov hd, hm
shl l_strideq, 2
sub lq, 4
@@ -1058,7 +1060,7 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .no_filter
- FILTER 4, h
+ call .h4
jmp .end
.no_filter:
@@ -1071,15 +1073,19 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
sub hd, 4
jg .loop
RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
INIT_YMM avx2
cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov wd, wm
shl l_strideq, 2
sub lq, l_strideq
@@ -1100,7 +1106,7 @@ cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .end
- FILTER 4, v
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4
.end:
pslld m12, 4
@@ -1115,10 +1121,10 @@ INIT_YMM avx2
cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov hd, hm
shl l_strideq, 2
sub lq, 4
@@ -1138,7 +1144,7 @@ cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .no_filter
- FILTER 4, h
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
jmp .end
.no_filter:
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm
new file mode 100644
index 00000000000..b7bc3aa106f
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm
@@ -0,0 +1,912 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+l_shuf_v: times 2 db 0, 32
+pw_1: times 2 dw 1
+ times 2 db 4, 36
+pw_3: times 2 dw 3
+ times 2 db 8, 40
+pw_4: times 2 dw 4
+ times 2 db 12, 44
+pw_16: times 2 dw 16
+ times 2 db 16, 48
+pw_4096: times 2 dw 4096
+ times 2 db 20, 52
+pw_16384: times 2 dw 16384
+ times 2 db 24, 56
+pw_32767: times 2 dw 32767
+ times 2 db 28, 60
+ times 2 dw 0
+filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128
+stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25
+l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1
+clip_max: dw 511, 511, 2047, 2047
+clip_min: dw -512, -512, -2048, -2048
+
+SECTION .text
+
+%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp
+ punpckhwd m%9, m%5, m%6
+ punpcklwd m%5, m%6
+ punpckhwd m%6, m%1, m%2
+ punpcklwd m%1, m%2
+ punpckhwd m%2, m%7, m%8
+ punpcklwd m%7, m%8
+ punpckhwd m%8, m%3, m%4
+ punpcklwd m%3, m%4
+ punpckhdq m%4, m%1, m%3
+ punpckldq m%1, m%3
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%5, m%7
+ punpckhdq m%7, m%6, m%8
+ punpckldq m%6, m%8
+ punpckldq m%8, m%9, m%2
+ punpckhdq m%9, m%2
+ punpckhqdq m%2, m%1, m%3
+ punpcklqdq m%1, m%3
+ punpcklqdq m%3, m%4, m%5
+ punpckhqdq m%4, m%5
+ punpcklqdq m%5, m%6, m%8
+ punpckhqdq m%6, m%8
+ punpckhqdq m%8, m%7, m%9
+ punpcklqdq m%7, m%9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%ifidn %2, v
+%if %1 == 16
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1 ]
+ mova m1, [tmpq+strideq*2 ] ; p5
+ mova m2, [tmpq+stride3q ] ; p4
+ mova m3, [tmpq+strideq*4 ] ; p3
+ mova m4, [tmpq+stride5q ] ; p2
+%elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 == 8
+ mova m3, [tmpq+strideq*0 ]
+%endif
+ mova m4, [tmpq+strideq*1 ]
+%endif
+ mova m5, [dstq+mstrideq*2] ; p1
+ mova m6, [dstq+mstrideq*1] ; p0
+ mova m7, [dstq+strideq*0 ] ; q0
+ mova m8, [dstq+strideq*1 ] ; q1
+%if %1 != 4
+ mova m9, [dstq+strideq*2 ] ; q2
+%endif
+%if %1 == 8 || %1 == 16
+ mova m10, [dstq+stride3q ] ; q3
+%endif
+%if %1 == 16
+ mova m11, [dstq+strideq*4 ] ; q4
+ mova m22, [dstq+stride5q ] ; q5
+ mova m23, [dstq+stride3q*2]
+%endif
+%else ; h
+%if %1 == 16
+ movu ym16, [dstq+strideq*0 -16]
+ movu ym17, [dstq+strideq*1 -16]
+ movu ym18, [dstq+strideq*2 -16]
+ movu ym19, [dstq+stride3q -16]
+ movu ym20, [dstq+strideq*4 -16]
+ movu ym22, [dstq+stride5q -16]
+ movu ym23, [dstq+stride3q*2-16]
+ movu ym28, [dstq+stride7q -16]
+ lea tmpq, [dstq+strideq*8 -16]
+ vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m10, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1
+ vinserti32x8 m22, m22, [tmpq+stride5q ], 1
+ vinserti32x8 m23, m23, [tmpq+stride3q*2], 1
+ vinserti32x8 m28, m28, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8]
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27
+ movu ym16, [tmpq+strideq*0 ]
+ movu ym17, [tmpq+strideq*1 ]
+ movu ym18, [tmpq+strideq*2 ]
+ movu ym19, [tmpq+stride3q ]
+ movu ym24, [tmpq+strideq*4 ]
+ movu ym25, [tmpq+stride5q ]
+ movu ym26, [tmpq+stride3q*2]
+ movu ym20, [tmpq+stride7q ]
+ lea tmpq, [tmpq+strideq*8]
+ vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m3, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1
+ vinserti32x8 m5, m25, [tmpq+stride5q ], 1
+ vinserti32x8 m6, m26, [tmpq+stride3q*2], 1
+ vinserti32x8 m20, m20, [tmpq+stride7q ], 1
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27
+ vshufi32x4 m27, m7, m0, q2020
+ vshufi32x4 m7, m0, q3131
+ vshufi32x4 m0, m8, m1, q2020
+ vshufi32x4 m8, m1, q3131
+ vshufi32x4 m1, m9, m2, q2020
+ vshufi32x4 m9, m2, q3131
+ vshufi32x4 m2, m10, m3, q2020
+ vshufi32x4 m10, m3, q3131
+ vshufi32x4 m3, m11, m4, q2020
+ vshufi32x4 m11, m4, q3131
+ vshufi32x4 m4, m22, m5, q2020
+ vshufi32x4 m22, m5, q3131
+ vshufi32x4 m5, m23, m6, q2020
+ vshufi32x4 m23, m6, q3131
+ vshufi32x4 m6, m28, m20, q2020
+ vshufi32x4 m28, m20, q3131
+%elif %1 == 6 || %1 == 8
+%if %1 == 8
+ sub dstq, 8
+ movu xm16, [dstq+strideq*0 ]
+ movu xm17, [dstq+strideq*1 ]
+ movu xm18, [dstq+strideq*2 ]
+ movu xm19, [dstq+stride3q ]
+ movu xm24, [dstq+strideq*4 ]
+ movu xm25, [dstq+stride5q ]
+ movu xm26, [dstq+stride3q*2]
+ movu xm27, [dstq+stride7q ]
+ lea tmpq, [dstq+strideq*8 ]
+ vinserti128 ym16, [tmpq+strideq*0 ], 1
+ vinserti128 ym17, [tmpq+strideq*1 ], 1
+ vinserti128 ym18, [tmpq+strideq*2 ], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ vinserti128 ym24, [tmpq+strideq*4 ], 1
+ vinserti128 ym25, [tmpq+stride5q ], 1
+ vinserti128 ym26, [tmpq+stride3q*2], 1
+ vinserti128 ym27, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2
+ vinserti32x4 m9, m25, [tmpq+stride5q ], 2
+ vinserti32x4 m3, m26, [tmpq+stride3q*2], 2
+ vinserti32x4 m4, m27, [tmpq+stride7q ], 2
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, [tmpq+strideq*0 ], 3
+ vinserti32x4 m8, [tmpq+strideq*1 ], 3
+ vinserti32x4 m5, [tmpq+strideq*2 ], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ vinserti32x4 m2, [tmpq+strideq*4 ], 3
+ vinserti32x4 m9, [tmpq+stride5q ], 3
+ vinserti32x4 m3, [tmpq+stride3q*2], 3
+ vinserti32x4 m4, [tmpq+stride7q ], 3
+%else ; %1 == 6
+ movu xm16, [dstq+strideq*0-8]
+ movu xm17, [dstq+strideq*1-8]
+ movu xm18, [dstq+strideq*2-8]
+ movu xm19, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4-8]
+ movu xm2, [tmpq+strideq*0]
+ movu xm9, [tmpq+strideq*1]
+ movu xm3, [tmpq+strideq*2]
+ movu xm4, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym16, [tmpq+strideq*0], 1
+ vinserti128 ym17, [tmpq+strideq*1], 1
+ vinserti128 ym18, [tmpq+strideq*2], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym2, [tmpq+strideq*0], 1
+ vinserti128 ym9, [tmpq+strideq*1], 1
+ vinserti128 ym3, [tmpq+strideq*2], 1
+ vinserti128 ym4, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, m16, [tmpq+strideq*0], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 2
+ vinserti32x4 m9, [tmpq+strideq*1], 2
+ vinserti32x4 m3, [tmpq+strideq*2], 2
+ vinserti32x4 m4, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, [tmpq+strideq*0], 3
+ vinserti32x4 m8, [tmpq+strideq*1], 3
+ vinserti32x4 m5, [tmpq+strideq*2], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 3
+ vinserti32x4 m9, [tmpq+strideq*1], 3
+ vinserti32x4 m3, [tmpq+strideq*2], 3
+ vinserti32x4 m4, [tmpq+stride3q ], 3
+%endif
+ punpcklwd m6, m10, m8
+ punpckhwd m10, m8
+ punpcklwd m8, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m2, m9
+ punpckhwd m2, m9
+ punpcklwd m9, m3, m4
+ punpckhwd m3, m4
+ punpckldq m4, m6, m8
+ punpckhdq m6, m8
+ punpckldq m8, m10, m5
+ punpckhdq m10, m5
+ punpckldq m5, m7, m9
+ punpckhdq m7, m9
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+%if %1 == 8
+ punpcklqdq m3, m4, m5
+%endif
+ punpckhqdq m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m9
+ punpckhqdq m8, m9
+ punpcklqdq m9, m10, m2
+%if %1 == 8
+ punpckhqdq m10, m2
+%endif
+%else ; %1 == 4
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdq m7{k1}, [dstq+ym12-4]
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpgatherdq m4{k2}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpgatherdq m5{k1}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ vpgatherdq m6{k2}, [tmpq+ym12]
+ punpcklwd m8, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m8, m7
+ punpckhwd m8, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m4
+ punpckhqdq m8, m4
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu ym16, [lq+l_strideq*1]
+ movsldup m17, [l_shuf_v]
+ vptestnmb k1, ym16, ym16
+ vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][]
+ vpermb m16, m17, m16 ; l[x][1]
+%else
+ movq xm16, [lq+l_strideq*0]
+ movq xm17, [lq+l_strideq*1]
+ vinserti128 ym16, [lq+l_strideq*2], 1
+ vinserti128 ym17, [lq+l_stride3q ], 1
+ lea tmpq, [lq+l_strideq*4]
+ vinserti32x4 m16, [tmpq+l_strideq*0], 2
+ vinserti32x4 m17, [tmpq+l_strideq*1], 2
+ vinserti32x4 m16, [tmpq+l_strideq*2], 3
+ vinserti32x4 m17, [tmpq+l_stride3q ], 3
+ punpcklqdq m16, m17
+ vbroadcasti32x4 m17, [l_shuf_h]
+ vptestnmb k1, m16, m16
+ vpalignr m16{k1}, m16, 12
+ pshufb m16, m17 ; l[x][1]
+%endif
+ vpbroadcastd m20, [pw_32767]
+ psubw m17, m5, m6 ; p1-p0
+ psubw m18, m7, m8 ; q1-q0
+ vptestmw k1, m16, m16 ; L
+ pabsw m17, m17
+ pabsw m18, m18
+ vpmaxuw m20{k1}, m17, m18
+ vpbroadcastw m17, [lutq+136]
+ psrlw m18, m16, [lutq+128]
+ vpbroadcastd m19, [pw_1]
+ pminuw m18, m17
+ psrlw m17, m16, 4 ; H
+ paddw m16, m16
+ pmaxuw m18, m19 ; I
+ vpaddd m16, [pw_4] {1to16}
+ paddw m16, m18 ; E
+ REPX {pmullw x, m13}, m17, m18, m16
+ vpcmpw k4, m20, m17, 6 ; hev
+%if %1 != 4
+ psubw m19, m4, m5 ; p2-p1
+ pabsw m19, m19
+%if %1 == 8 || %1 == 16
+ psubw m17, m3, m4 ; p3-p2
+ pabsw m17, m17
+ pmaxuw m19, m17
+ psubw m17, m9, m10 ; q3-q2
+ pabsw m17, m17
+ pmaxuw m19, m17
+%endif
+ psubw m17, m9, m8 ; q2-q1
+ pabsw m17, m17
+ pmaxuw m19, m17
+%if %1 == 16
+ vpbroadcastd ym17, [maskq+4]
+ vpord ym17, [maskq+8] {1to8}
+ vptestmd k1, ym17, ym21
+%else
+ vptestmd k1, ym21, [maskq+4] {1to8}
+%endif
+ pmaxuw m19, m20
+ psubw m17, m4, m6 ; p2-p0
+ pabsw m17, m17
+ pmaxuw m17, m20
+ vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks
+%if %1 == 8 || %1 == 16
+ psubw m19, m3, m6 ; p3-p0
+ pabsw m19, m19
+ pmaxuw m17, m19
+ psubw m19, m7, m10 ; q3-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ psubw m19, m7, m9 ; q2-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ vpcmpw k1, m20, m18, 2
+ psubw m18, m5, m8 ; p1-q1
+ psubw m19, m6, m7 ; p0-q0
+ pabsw m18, m18
+ pabsw m19, m19
+ psrlw m18, 1
+ paddw m19, m19
+ paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+%if %1 != 4
+ vpcmpw k2{k1}, m17, m13, 2 ; flat8in
+%endif
+%if %1 == 16
+ psubw m20, m0, m6
+ psubw m16, m1, m6
+ pabsw m20, m20
+ psubw m17, m2, m6
+ pabsw m16, m16
+ psubw m18, m11, m7
+ pabsw m17, m17
+ psubw m19, m22, m7
+ pabsw m18, m18
+ pmaxuw m20, m16
+ psubw m16, m23, m7
+ pabsw m19, m19
+ pmaxuw m17, m18
+ pabsw m16, m16
+ vpandd ym18, ym21, [maskq+8] {1to8}
+ pmaxuw m20, m17
+ pmaxuw m19, m16
+ pcmpeqd ym16, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8
+ pmaxuw m20, m19
+ pcmpeqd ym17, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8
+ vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out
+ pcmpeqd ym18, ym21
+ vptestmb k3{k3}, ym16, ym16 ; flat8 & fm
+ vptestmb k2{k2}, ym17, ym17 ; flat8in
+ vptestmb k1{k1}, ym18, ym18
+ kandnd k1, k2, k1 ; fm & !flat8 & !flat16
+ kandnd k2, k3, k2 ; flat8 & !flat16
+%elif %1 == 6 || %1 == 8
+ vpandd ym17, ym21, [maskq+4] {1to8}
+ pcmpeqd ym16, ym21, ym17
+ vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8
+ pcmpeqd ym17, ym21
+ vptestmb k2{k2}, ym16, ym16 ; flat8 & fm
+ vptestmb k1{k1}, ym17, ym17
+ kandnd k1, k2, k1 ; fm & !flat8
+%else ; %1 == 4
+ vpandd ym16, ym21, [maskq+0] {1to8}
+ pcmpeqd ym16, ym21
+ vptestmb k1{k1}, ym16, ym16
+%endif
+
+ ; short filter
+ psubw m16, m7, m6
+ vpbroadcastd m17, [pw_3]
+ paddw m18, m16, m16
+ paddw m18, m16
+ psubw m16, m5, m8 ; iclip_diff(p1-q1)
+ pminsw m16, m14
+ vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev
+ knotd k4, k4 ; !hev
+ paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f)
+ vpbroadcastd m18, [pw_4]
+ pminsw m16, m14
+ vpmaxsw m16{k1}{z}, m15 ; f&=fm
+ paddw m17, m16
+ paddw m16, m18
+ vpbroadcastd m18, [pw_16384]
+ pminsw m17, m14
+ pminsw m16, m14
+ psraw m17, 3 ; f2
+ psraw m16, 3 ; f1
+ paddw m6, m17
+ psubw m7, m16
+ vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev
+ psubw m17, m14, m15 ; 1023 or 4095
+ pxor m18, m18
+ paddw m5, m16
+ psubw m8, m16
+ REPX {pminsw x, m17}, m6, m7, m5, m8
+ REPX {pmaxsw x, m18}, m6, m7, m5, m8
+
+%if %1 == 16 ; flat16 filter
+ vpaddd m19, m0, [pw_1] {1to16}
+ paddw m16, m1, m2 ; p5+p4
+ paddw m26, m1, m6 ; p5+p0
+ paddw m24, m2, m7 ; p4+q0
+ paddw m16, m4 ; p5+p4+p3
+ paddw m17, m3, m5 ; p2+p1
+ psllw m19, 3
+ paddw m16, m26 ; p5*2+p4+p3+p0
+ paddw m17, m24 ; p4+p2+p1+q0
+ psubw m19, m0 ; p6*7+8
+ paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0
+ paddw m18, m3, m8
+ paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0
+ paddw m25, m1, m0
+ paddw m16, m0, m0
+ psrlw m1{k3}, m19, 4
+ paddw m19, m18
+ psubw m19, m16 ; +p3+q1-p6*2
+ paddw m16, m2, m0
+ psrlw m2{k3}, m19, 4
+ psubw m19, m25
+ paddw m25, m4, m9
+ paddw m20, m10, m5
+ paddw m19, m25 ; +p2+q2-p6-p5
+ paddw m17, m0, m3
+ psubw m16, m20, m16
+ psrlw m3{k3}, m19, 4
+ paddw m19, m16 ; +p1+q3-p6-p4
+ paddw m16, m11, m6
+ psubw m16, m17
+ paddw m17, m0, m4
+ psrlw m4{k3}, m19, 4
+ paddw m19, m16 ; +p0+q4-p6-p3
+ paddw m16, m22, m7
+ psubw m16, m17
+ paddw m17, m0, m5
+ psrlw m5{k3}, m19, 4
+ paddw m19, m16 ; +q0+q5-p6-p2
+ paddw m16, m23, m8
+ psrlw m6{k3}, m19, 4
+ psubw m16, m17
+ paddw m19, m16 ; +q1+q6-p6-p1
+ paddw m16, m23, m9
+ psrlw m7{k3}, m19, 4
+ psubw m16, m26
+ paddw m19, m16 ; +q2+q6-p5-p0
+ paddw m16, m23, m10
+ psrlw m8{k3}, m19, 4
+ psubw m16, m24
+ paddw m19, m16 ; +q3+q6-p4-p0
+ paddw m16, m23, m11
+ psrlw m9{k3}, m19, 4
+ psubw m16, m18
+ paddw m19, m16 ; +q4+q6-p3-q1
+ paddw m16, m23, m22
+ psrlw m10{k3}, m19, 4
+ psubw m16, m25
+ paddw m19, m16 ; +q5+q6-p2-q2
+ paddw m16, m23, m23
+ psrlw m11{k3}, m19, 4
+ psubw m16, m20
+ paddw m19, m16 ; +q6*2-p1-q3
+ psrlw m22{k3}, m19, 4
+%endif
+%if %1 == 8 || %1 == 16 ; flat8 filter
+ vpbroadcastd m20, [pw_4096]
+ paddw m16, m3, m4 ; p3+p2
+ paddw m19, m5, m6 ; p1+p0
+ paddw m17, m16, m16 ; 2*(p3+p2)
+ paddw m19, m3 ; p1+p0+p3
+ paddw m17, m7 ; 2*(p3+p2)+q0
+ paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0
+ paddw m18, m4, m7
+ pmulhrsw m4{k2}, m19, m20
+ psubw m19, m16
+ paddw m17, m5, m8
+ paddw m16, m3, m5
+ paddw m19, m17
+ pmulhrsw m5{k2}, m19, m20
+ psubw m19, m16
+ paddw m16, m6, m9
+ paddw m19, m16
+ paddw m16, m3, m6
+ pmulhrsw m6{k2}, m19, m20
+ paddw m19, m10
+ psubw m16, m7, m16
+ paddw m19, m16
+ psubw m16, m10, m18
+ pmulhrsw m7{k2}, m19, m20
+ paddw m16, m8
+ paddw m19, m16
+ psubw m16, m10, m17
+ pmulhrsw m8{k2}, m19, m20
+ paddw m16, m9
+ paddw m19, m16
+ pmulhrsw m9{k2}, m19, m20
+%elif %1 == 6 ; flat6 filter
+ vpbroadcastd m10, [pw_4096]
+ paddw m2, m5, m6
+ paddw m0, m4, m7
+ paddw m1, m2, m4 ; p2+p1+p0
+ paddw m3, m4, m4
+ paddw m1, m1
+ paddw m4, m5
+ paddw m1, m0 ; p2+2*(p2+p1+p0)+q0
+ psubw m3, m7, m3
+ pmulhrsw m5{k2}, m1, m10
+ paddw m3, m8
+ psubw m4, m8, m4
+ paddw m1, m3
+ pmulhrsw m6{k2}, m1, m10
+ paddw m4, m9
+ paddw m9, m9
+ paddw m1, m4
+ pmulhrsw m7{k2}, m1, m10
+ psubw m9, m2
+ paddw m1, m9
+ pmulhrsw m8{k2}, m1, m10
+%endif
+
+%ifidn %2, v
+%if %1 == 16
+ mova [tmpq+strideq*2 ], m1 ; p5
+ mova [tmpq+stride3q ], m2 ; p4
+ mova [tmpq+strideq*4 ], m3 ; p3
+ mova [tmpq+stride5q ], m4 ; p2
+%elif %1 == 8
+ mova [tmpq+strideq*1 ], m4 ; p2
+%endif
+ mova [dstq+mstrideq*2], m5 ; p1
+ mova [dstq+mstrideq ], m6 ; p0
+ mova [dstq+strideq*0 ], m7 ; q0
+ mova [dstq+strideq*1 ], m8 ; q1
+%if %1 == 8 || %1 == 16
+ mova [dstq+strideq*2 ], m9 ; q2
+%endif
+%if %1 == 16
+ mova [dstq+stride3q ], m10 ; q3
+ mova [dstq+strideq*4 ], m11 ; q4
+ mova [dstq+stride5q ], m22 ; q5
+%endif
+%else
+%if %1 == 16
+ TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20
+ mova [dstq+strideq*0 -16], xm27
+ mova [dstq+strideq*0 ], xm7
+ mova [dstq+strideq*1 -16], xm0
+ mova [dstq+strideq*1 ], xm8
+ mova [dstq+strideq*2 -16], xm1
+ mova [dstq+strideq*2 ], xm9
+ mova [dstq+stride3q -16], xm2
+ mova [dstq+stride3q ], xm10
+ mova [dstq+strideq*4 -16], xm3
+ mova [dstq+strideq*4 ], xm11
+ mova [dstq+stride5q -16], xm4
+ mova [dstq+stride5q ], xm22
+ mova [dstq+stride3q*2-16], xm5
+ mova [dstq+stride3q*2 ], xm23
+ mova [dstq+stride7q -16], xm6
+ mova [dstq+stride7q ], xm28
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 -16], ym27, 1
+ vextracti128 [dstq+strideq*0 ], ym7, 1
+ vextracti128 [dstq+strideq*1 -16], ym0, 1
+ vextracti128 [dstq+strideq*1 ], ym8, 1
+ vextracti128 [dstq+strideq*2 -16], ym1, 1
+ vextracti128 [dstq+strideq*2 ], ym9, 1
+ vextracti128 [dstq+stride3q -16], ym2, 1
+ vextracti128 [dstq+stride3q ], ym10, 1
+ vextracti128 [dstq+strideq*4 -16], ym3, 1
+ vextracti128 [dstq+strideq*4 ], ym11, 1
+ vextracti128 [dstq+stride5q -16], ym4, 1
+ vextracti128 [dstq+stride5q ], ym22, 1
+ vextracti128 [dstq+stride3q*2-16], ym5, 1
+ vextracti128 [dstq+stride3q*2 ], ym23, 1
+ vextracti128 [dstq+stride7q -16], ym6, 1
+ vextracti128 [dstq+stride7q ], ym28, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 2
+ vextracti32x4 [dstq+strideq*0 ], m7, 2
+ vextracti32x4 [dstq+strideq*1 -16], m0, 2
+ vextracti32x4 [dstq+strideq*1 ], m8, 2
+ vextracti32x4 [dstq+strideq*2 -16], m1, 2
+ vextracti32x4 [dstq+strideq*2 ], m9, 2
+ vextracti32x4 [dstq+stride3q -16], m2, 2
+ vextracti32x4 [dstq+stride3q ], m10, 2
+ vextracti32x4 [dstq+strideq*4 -16], m3, 2
+ vextracti32x4 [dstq+strideq*4 ], m11, 2
+ vextracti32x4 [dstq+stride5q -16], m4, 2
+ vextracti32x4 [dstq+stride5q ], m22, 2
+ vextracti32x4 [dstq+stride3q*2-16], m5, 2
+ vextracti32x4 [dstq+stride3q*2 ], m23, 2
+ vextracti32x4 [dstq+stride7q -16], m6, 2
+ vextracti32x4 [dstq+stride7q ], m28, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 3
+ vextracti32x4 [dstq+strideq*0 ], m7, 3
+ vextracti32x4 [dstq+strideq*1 -16], m0, 3
+ vextracti32x4 [dstq+strideq*1 ], m8, 3
+ vextracti32x4 [dstq+strideq*2 -16], m1, 3
+ vextracti32x4 [dstq+strideq*2 ], m9, 3
+ vextracti32x4 [dstq+stride3q -16], m2, 3
+ vextracti32x4 [dstq+stride3q ], m10, 3
+ vextracti32x4 [dstq+strideq*4 -16], m3, 3
+ vextracti32x4 [dstq+strideq*4 ], m11, 3
+ vextracti32x4 [dstq+stride5q -16], m4, 3
+ vextracti32x4 [dstq+stride5q ], m22, 3
+ vextracti32x4 [dstq+stride3q*2-16], m5, 3
+ vextracti32x4 [dstq+stride3q*2 ], m23, 3
+ vextracti32x4 [dstq+stride7q -16], m6, 3
+ vextracti32x4 [dstq+stride7q ], m28, 3
+%elif %1 == 8
+ TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2
+ movu [dstq+strideq*0 ], xm3
+ movu [dstq+strideq*1 ], xm4
+ movu [dstq+strideq*2 ], xm5
+ movu [dstq+stride3q ], xm6
+ movu [dstq+strideq*4 ], xm7
+ movu [dstq+stride5q ], xm8
+ movu [dstq+stride3q*2], xm9
+ movu [dstq+stride7q ], xm10
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 ], ym3, 1
+ vextracti128 [dstq+strideq*1 ], ym4, 1
+ vextracti128 [dstq+strideq*2 ], ym5, 1
+ vextracti128 [dstq+stride3q ], ym6, 1
+ vextracti128 [dstq+strideq*4 ], ym7, 1
+ vextracti128 [dstq+stride5q ], ym8, 1
+ vextracti128 [dstq+stride3q*2], ym9, 1
+ vextracti128 [dstq+stride7q ], ym10, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 2
+ vextracti32x4 [dstq+strideq*1 ], m4, 2
+ vextracti32x4 [dstq+strideq*2 ], m5, 2
+ vextracti32x4 [dstq+stride3q ], m6, 2
+ vextracti32x4 [dstq+strideq*4 ], m7, 2
+ vextracti32x4 [dstq+stride5q ], m8, 2
+ vextracti32x4 [dstq+stride3q*2], m9, 2
+ vextracti32x4 [dstq+stride7q ], m10, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 3
+ vextracti32x4 [dstq+strideq*1 ], m4, 3
+ vextracti32x4 [dstq+strideq*2 ], m5, 3
+ vextracti32x4 [dstq+stride3q ], m6, 3
+ vextracti32x4 [dstq+strideq*4 ], m7, 3
+ vextracti32x4 [dstq+stride5q ], m8, 3
+ vextracti32x4 [dstq+stride3q*2], m9, 3
+ vextracti32x4 [dstq+stride7q ], m10, 3
+ lea dstq, [dstq+strideq*8+8]
+%else ; %1 == 4 || %1 == 6
+ punpcklwd m9, m5, m6
+ punpckhwd m5, m6
+ kxnorb k1, k1, k1
+ punpcklwd m6, m7, m8
+ punpckhwd m7, m8
+ kmovb k2, k1
+ punpckldq m8, m9, m6
+ vpscatterdq [dstq+ym12-4]{k1}, m8
+ punpckhdq m9, m6
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpscatterdq [tmpq+ym12]{k2}, m9
+ punpckldq m6, m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpscatterdq [tmpq+ym12]{k1}, m6
+ punpckhdq m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ vpscatterdq [tmpq+ym12]{k2}, m5
+%endif
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride, tmp, \
+ mask_bits, stride5
+%define base tmpq-filter_mask
+ SWAP 12, 26 ; avoids clobbering xmm10 on WIN64
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, v
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call .v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4: ; called by both luma and chroma
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, l_stride3, tmp, \
+ mask_bits, stride5, stride7
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride7q, [strideq+stride3q*2]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, h
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, h
+ jmp .end2
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+ call .h4
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+.end:
+ lea dstq, [dstq+strideq*8]
+.end2:
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4: ; called by both luma and chroma
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ shl l_strideq, 2
+ lea stride3q, [strideq*3]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride3q, [strideq*3]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, h
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4
+.end:
+ lea tmpq, [strideq+stride3q]
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea dstq, [dstq+tmpq*8]
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
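
For reference, the "short filter" block inside the FILTER macro above (the psubw/pminsw/vpmaxsw sequence that uses the clip_max/clip_min constants together with the hev mask in k4 and the fm mask in k1) implements the standard AV1 narrow deblocking update. The scalar C sketch below mirrors that arithmetic only; helper names are hypothetical, and the per-lane fm gating done by the asm's zero-masking is omitted.

/* Scalar sketch of the high-bitdepth short (4-tap) filter arithmetic.
 * cmax/cmin correspond to the clip_max/clip_min constants above:
 * 511/-512 for 10 bpc, 2047/-2048 for 12 bpc. Illustrative only. */
static inline int imin(int a, int b) { return a < b ? a : b; }
static inline int iclip(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

static void short_filter(int *p1, int *p0, int *q0, int *q1,
                         int hev, int bitdepth_min_8, int bitdepth_max)
{
    const int cmax =  128 * (1 << bitdepth_min_8) - 1; /* 511 or 2047   */
    const int cmin = -128 * (1 << bitdepth_min_8);     /* -512 or -2048 */
    int f = hev ? iclip(*p1 - *q1, cmin, cmax) : 0;    /* f &= hev */
    f = iclip(3 * (*q0 - *p0) + f, cmin, cmax);
    const int f1 = imin(f + 4, cmax) >> 3;             /* psraw 3 after clamp */
    const int f2 = imin(f + 3, cmax) >> 3;
    *p0 = iclip(*p0 + f2, 0, bitdepth_max);
    *q0 = iclip(*q0 - f1, 0, bitdepth_max);
    if (!hev) {                         /* pmulhrsw by 16384 == (f1+1)>>1 */
        const int f3 = (f1 + 1) >> 1;
        *p1 = iclip(*p1 + f3, 0, bitdepth_max);
        *q1 = iclip(*q1 - f3, 0, bitdepth_max);
    }
}
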
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm
index d6b296b19ef..84696c758ae 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm
@@ -1444,7 +1444,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
cmp byte [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call .v4
.end:
add lq, 32
@@ -1453,6 +1453,10 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
sub wd, 8
jg .loop
RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
INIT_YMM avx2
cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
@@ -1481,7 +1485,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
cmp byte [maskq+0], 0 ; vmask[0]
je .no_filter
- FILTER 4, h
+ call .h4
jmp .end
.no_filter:
@@ -1493,6 +1497,10 @@ cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
sub hd, 8
jg .loop
RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
INIT_YMM avx2
cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
@@ -1515,7 +1523,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
cmp byte [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4
.end:
add lq, 32
@@ -1545,7 +1553,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
cmp byte [maskq+0], 0 ; vmask[0]
je .no_filter
- FILTER 4, h
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4
jmp .end
.no_filter:
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
index c09dced418b..0218b624d3c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
@@ -80,25 +80,24 @@ SECTION .text
punpckhwd m%1, m%3
kmovw k1, k6
lea t0, [dstq+strideq*4]
- vpscatterdd [dstq+m29-2]{k1}, m%4
+ vpscatterdd [dstq+m19-2]{k1}, m%4
kmovw k1, k6
lea t1, [dstq+strideq*8]
- vpscatterdd [t0 +m29-2]{k1}, m%5
+ vpscatterdd [t0 +m19-2]{k1}, m%5
kmovw k1, k6
lea t2, [t0 +strideq*8]
- vpscatterdd [t1 +m29-2]{k1}, m%2
+ vpscatterdd [t1 +m19-2]{k1}, m%2
kmovw k1, k6
- vpscatterdd [t2 +m29-2]{k1}, m%1
+ vpscatterdd [t2 +m19-2]{k1}, m%1
%endmacro
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
- SWAP m16, m15
+ SWAP m16, m22
%endif
- ; input in m0-15
- punpcklbw m15, m0, m1
- punpckhbw m0, m1
- punpcklbw m1, m2, m3
+ punpcklbw m22, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m2, m3
punpckhbw m2, m3
punpcklbw m3, m4, m5
punpckhbw m4, m5
@@ -108,21 +107,21 @@ SECTION .text
punpckhbw m8, m9
punpcklbw m9, m10, m11
punpckhbw m10, m11
- punpcklbw m11, m12, m13
- punpckhbw m12, m13
+ punpcklbw m11, m25, m13
+ punpckhbw m25, m13
%if %1 == 0
SWAP m13, m16
%else
mova m13, %3
%endif
- SWAP m16, m12
- punpcklbw m12, m14, m13
+ SWAP m16, m25
+ punpcklbw m25, m14, m13
punpckhbw m13, m14, m13
- ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
- punpcklwd m14, m15, m1
- punpckhwd m15, m1
- punpcklwd m1, m0, m2
- punpckhwd m0, m2
+ ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+ punpcklwd m14, m22, m26
+ punpckhwd m22, m26
+ punpcklwd m26, m24, m2
+ punpckhwd m24, m2
punpcklwd m2, m3, m5
punpckhwd m3, m5
punpcklwd m5, m4, m6
@@ -131,58 +130,58 @@ SECTION .text
punpckhwd m7, m9
punpcklwd m9, m8, m10
punpckhwd m8, m10
- punpcklwd m10, m11, m12
- punpckhwd m11, m12
- SWAP m12, m16, m11
- punpcklwd m11, m12, m13
- punpckhwd m12, m13
- ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+ punpcklwd m10, m11, m25
+ punpckhwd m11, m25
+ SWAP m25, m16, m11
+ punpcklwd m11, m25, m13
+ punpckhwd m25, m13
+ ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
punpckldq m13, m14, m2
punpckhdq m14, m2
- punpckldq m2, m15, m3
- punpckhdq m15, m3
- punpckldq m3, m1, m5
- punpckhdq m1, m5
- punpckldq m5, m0, m4
- punpckhdq m0, m4
+ punpckldq m2, m22, m3
+ punpckhdq m22, m3
+ punpckldq m3, m26, m5
+ punpckhdq m26, m5
+ punpckldq m5, m24, m4
+ punpckhdq m24, m4
punpckldq m4, m6, m10
punpckhdq m6, m10
punpckldq m10, m9, m11
punpckhdq m9, m11
- punpckldq m11, m8, m12
- punpckhdq m8, m12
- SWAP m12, m16, m8
- punpckldq m8, m7, m12
- punpckhdq m7, m12
- ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
- punpcklqdq m12, m13, m4
+ punpckldq m11, m8, m25
+ punpckhdq m8, m25
+ SWAP m25, m16, m8
+ punpckldq m8, m7, m25
+ punpckhdq m7, m25
+ ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m25, m13, m4
punpckhqdq m13, m4
punpcklqdq m4, m14, m6
punpckhqdq m14, m6
punpcklqdq m6, m2, m8
punpckhqdq m2, m8
- punpcklqdq m8, m15, m7
- punpckhqdq m15, m7
+ punpcklqdq m8, m22, m7
+ punpckhqdq m22, m7
punpcklqdq m7, m3, m10
punpckhqdq m3, m10
- punpcklqdq m10, m1, m9
- punpckhqdq m1, m9
+ punpcklqdq m10, m26, m9
+ punpckhqdq m26, m9
punpcklqdq m9, m5, m11
punpckhqdq m5, m11
SWAP m11, m16
%if %2 == 0
- SWAP m16, m12
+ SWAP m16, m25
%else
- mova %3, m12
+ mova %3, m25
%endif
- punpcklqdq m12, m0, m11
- punpckhqdq m0, m11
+ punpcklqdq m25, m24, m11
+ punpckhqdq m24, m11
%if %2 == 0
SWAP m11, m16
%endif
- ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
- SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
- SWAP 3, 14, 12, 9
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+ SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+ SWAP 3, 14, 25, 9
%endmacro
%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
@@ -205,7 +204,7 @@ SECTION .text
%endif
lea t0, [dstq+mstrideq*4]
%if %1 != 6
- mova m12, [t0 +strideq*0]
+ mova m25, [t0 +strideq*0]
%endif
mova m13, [t0 +strideq*1]
mova m3, [t0 +strideq*2]
@@ -214,13 +213,13 @@ SECTION .text
mova m6, [dstq+strideq*1]
mova m14, [dstq+strideq*2]
%if %1 != 6
- mova m15, [dstq+stride3q ]
+ mova m22, [dstq+stride3q ]
%endif
%if %1 == 16
lea t0, [dstq+strideq*4]
- mova m19, [t0 +strideq*0]
- mova m20, [t0 +strideq*1]
- mova m21, [t0 +strideq*2]
+ mova m29, [t0 +strideq*0]
+ mova m30, [t0 +strideq*1]
+ mova m31, [t0 +strideq*2]
%endif
%endif
%else ; h
@@ -230,15 +229,15 @@ SECTION .text
vbroadcasti32x4 m0, [hshuf4]
kmovw k1, k6
lea t0, [dstq+strideq*4]
- vpgatherdd m3{k1}, [dstq+m29-2]
+ vpgatherdd m3{k1}, [dstq+m19-2]
kmovw k1, k6
lea t1, [dstq+strideq*8]
- vpgatherdd m4{k1}, [t0 +m29-2]
+ vpgatherdd m4{k1}, [t0 +m19-2]
kmovw k1, k6
lea t2, [t0 +strideq*8]
- vpgatherdd m5{k1}, [t1 +m29-2]
+ vpgatherdd m5{k1}, [t1 +m19-2]
kmovw k1, k6
- vpgatherdd m6{k1}, [t2 +m29-2]
+ vpgatherdd m6{k1}, [t2 +m19-2]
pshufb m3, m0
pshufb m4, m0
pshufb m5, m0
@@ -257,16 +256,16 @@ SECTION .text
%elif %1 == 6 || %1 == 8
kmovb k1, k7
lea t0, [dstq+strideq*1]
- vpgatherdq m3{k1}, [dstq+ym31-%1/2]
+ vpgatherdq m3{k1}, [dstq+ym21-%1/2]
kmovb k1, k7
lea t1, [dstq+strideq*2]
- vpgatherdq m4{k1}, [t0 +ym31-%1/2]
+ vpgatherdq m4{k1}, [t0 +ym21-%1/2]
kmovb k1, k7
lea t2, [dstq+stride3q ]
- vpgatherdq m5{k1}, [t1 +ym31-%1/2]
+ vpgatherdq m5{k1}, [t1 +ym21-%1/2]
kmovb k1, k7
- vextracti32x8 ym0, m31, 1
- vpgatherdq m6{k1}, [t2 +ym31-%1/2]
+ vextracti32x8 ym0, m21, 1
+ vpgatherdq m6{k1}, [t2 +ym21-%1/2]
kmovb k1, k7
vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
kmovb k1, k7
@@ -344,7 +343,7 @@ SECTION .text
punpckhqdq m13, m5, m13
%if %1 == 8
punpcklqdq m5, m7, m12
- punpckhqdq m12, m7, m12
+ punpckhqdq m25, m7, m12
; xm3: A0-15
; xm14: B0-15
; xm15: C0-15
@@ -352,10 +351,11 @@ SECTION .text
; xm4: E0-15
; xm13: F0-15
; xm5: G0-15
- ; xm12: H0-15
- SWAP 12, 3, 15
+ ; xm25: H0-15
+ SWAP 25, 3, 15
SWAP 13, 14, 5, 4, 6
- ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+ SWAP 15, 22
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
%else
SWAP 13, 3, 14
SWAP 6, 4, 15, 5
@@ -364,8 +364,8 @@ SECTION .text
%else ; 16, h
; load and 16x16 transpose. We only use 14 pixels but we'll need the
; remainder at the end for the second transpose
- movu xm0, [dstq+strideq*0-8]
- movu xm1, [dstq+strideq*1-8]
+ movu xm24, [dstq+strideq*0-8]
+ movu xm26, [dstq+strideq*1-8]
movu xm2, [dstq+strideq*2-8]
movu xm3, [dstq+stride3q -8]
lea t0, [dstq+strideq*4]
@@ -379,13 +379,13 @@ SECTION .text
movu xm10, [t0 +strideq*2-8]
movu xm11, [t0 +stride3q -8]
lea t0, [t0 +strideq*4]
- movu xm12, [t0 +strideq*0-8]
+ movu xm25, [t0 +strideq*0-8]
movu xm13, [t0 +strideq*1-8]
movu xm14, [t0 +strideq*2-8]
- movu xm15, [t0 +stride3q -8]
+ movu xm22, [t0 +stride3q -8]
lea t0, [t0 +strideq*4]
- vinserti32x4 ym0, [t0 +strideq*0-8], 1
- vinserti32x4 ym1, [t0 +strideq*1-8], 1
+ vinserti32x4 ym24, [t0 +strideq*0-8], 1
+ vinserti32x4 ym26, [t0 +strideq*1-8], 1
vinserti32x4 ym2, [t0 +strideq*2-8], 1
vinserti32x4 ym3, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
@@ -399,13 +399,13 @@ SECTION .text
vinserti32x4 ym10, [t0 +strideq*2-8], 1
vinserti32x4 ym11, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
- vinserti32x4 ym12, [t0 +strideq*0-8], 1
+ vinserti32x4 ym25, [t0 +strideq*0-8], 1
vinserti32x4 ym13, [t0 +strideq*1-8], 1
vinserti32x4 ym14, [t0 +strideq*2-8], 1
- vinserti32x4 ym15, [t0 +stride3q -8], 1
+ vinserti32x4 ym22, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
- vinserti32x4 m0, [t0 +strideq*0-8], 2
- vinserti32x4 m1, [t0 +strideq*1-8], 2
+ vinserti32x4 m24, [t0 +strideq*0-8], 2
+ vinserti32x4 m26, [t0 +strideq*1-8], 2
vinserti32x4 m2, [t0 +strideq*2-8], 2
vinserti32x4 m3, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
@@ -419,13 +419,13 @@ SECTION .text
vinserti32x4 m10, [t0 +strideq*2-8], 2
vinserti32x4 m11, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
- vinserti32x4 m12, [t0 +strideq*0-8], 2
+ vinserti32x4 m25, [t0 +strideq*0-8], 2
vinserti32x4 m13, [t0 +strideq*1-8], 2
vinserti32x4 m14, [t0 +strideq*2-8], 2
- vinserti32x4 m15, [t0 +stride3q -8], 2
+ vinserti32x4 m22, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
- vinserti32x4 m0, [t0 +strideq*0-8], 3
- vinserti32x4 m1, [t0 +strideq*1-8], 3
+ vinserti32x4 m24, [t0 +strideq*0-8], 3
+ vinserti32x4 m26, [t0 +strideq*1-8], 3
vinserti32x4 m2, [t0 +strideq*2-8], 3
vinserti32x4 m3, [t0 +stride3q -8], 3
lea t0, [t0 +strideq*4]
@@ -439,41 +439,38 @@ SECTION .text
vinserti32x4 m10, [t0 +strideq*2-8], 3
vinserti32x4 m11, [t0 +stride3q -8], 3
lea t0, [t0 +strideq*4]
- vinserti32x4 m12, [t0 +strideq*0-8], 3
+ vinserti32x4 m25, [t0 +strideq*0-8], 3
vinserti32x4 m13, [t0 +strideq*1-8], 3
vinserti32x4 m14, [t0 +strideq*2-8], 3
- vinserti32x4 m15, [t0 +stride3q -8], 3
+ vinserti32x4 m22, [t0 +stride3q -8], 3
;
TRANSPOSE_16X16B 0, 1, [rsp+0*64]
- SWAP m16, m1
+ SWAP m16, m26
SWAP m17, m2
SWAP m18, m3
- SWAP m19, m12
- SWAP m20, m13
- SWAP m21, m14
- mova [rsp+4*64], m15
- ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
- SWAP 12, 4, 7
+ SWAP m29, m25
+ SWAP m30, m13
+ SWAP m31, m14
+ mova [rsp+4*64], m22
+ ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+ SWAP 25, 4, 7
SWAP 13, 5, 8
SWAP 3, 6, 9
SWAP 10, 14
- SWAP 11, 15
+ SWAP 11, 22
%endif
%endif
; load L/E/I/H
-%if is_uv
- SWAP m22, m15
-%endif
- vpbroadcastd m22, [pb_1]
+ vpbroadcastd m15, [pb_1]
%ifidn %2, v
movu m1, [lq]
movu m0, [lq+l_strideq]
%else
kmovw k1, k6
- vpgatherdd m0{k1}, [lq+m30+4]
+ vpgatherdd m0{k1}, [lq+m20+4]
kmovw k1, k6
- vpgatherdd m1{k1}, [lq+m30+0]
+ vpgatherdd m1{k1}, [lq+m20+0]
%endif
pxor m2, m2
pcmpeqb k1, m0, m2
@@ -484,7 +481,7 @@ SECTION .text
pand m2, [pb_63]{bcstd}
vpbroadcastb m1, [lutq+136]
pminub m2, m1
- pmaxub m2, m22 ; I
+ pmaxub m2, m15 ; I
pand m1, m0, [pb_240]{bcstd}
psrlq m1, 4 ; H
paddd m0, [pb_2]{bcstd}
@@ -500,7 +497,7 @@ SECTION .text
ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
pmaxub m9, m8
%else
- ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
+ ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
pmaxub m9, m8
ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
pmaxub m9, m10
@@ -508,17 +505,17 @@ SECTION .text
ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
pmaxub m9, m10
%if %1 != 6
- ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
+ ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
pmaxub m9, m10
%endif
- vpcmpub k2{k3}, m9, m22, 2 ; le ; flat8in
+ vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
%if %1 == 6
ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
%else
- ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
+ ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
pmaxub m10, m11
- ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
+ ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
pmaxub m10, m11
%endif
ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
@@ -526,16 +523,10 @@ SECTION .text
%if %1 == 16
vpbroadcastd m11, [maskq+8]
por m11, [maskq+4]{bcstd}
- pand m11, pbmask
%else
- %if !is_h || %1 == 6
- pand m11, pbmask, [maskq+4]{bcstd}
- %else
vpbroadcastd m11, [maskq+4]
- pand m11, pbmask
- %endif
%endif
- pcmpeqd k4, m11, pbmask
+ vptestmd k4, m11, pbmask
vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
pmaxub m8, m10
%endif
@@ -554,77 +545,58 @@ SECTION .text
pmaxub m1, m2
ABSSUB m2, m18, m4, m10
pmaxub m1, m2
- ABSSUB m2, m19, m5, m10
+ ABSSUB m2, m29, m5, m10
pmaxub m1, m2
- ABSSUB m2, m20, m5, m10
+ ABSSUB m2, m30, m5, m10
pmaxub m1, m2
- ABSSUB m2, m21, m5, m10
+ ABSSUB m2, m31, m5, m10
pmaxub m1, m2
- ;
- vpcmpub k4, m1, m22, 2 ; flat8out
- kandq k4, k4, k2 ; flat8in & flat8out
-
+ kandq k2, k2, k3
+ vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
vpbroadcastd m2, [maskq+8]
- pand m10, m2, pbmask
- pcmpeqd k5, m10, pbmask
+ vptestmd k5, m2, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k4, k4, k5 ; flat16
- kandq k4, k3, k4 ; flat16 & fm
+ vptestmb k4{k4}, m7, m7 ; flat16 & fm
por m10, m2, [maskq+4]{bcstd}
- pand m2, m10, pbmask
- pcmpeqd k5, m2, pbmask
+ vptestmd k5, m10, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k2, k2, k5 ; flat8in
- kandq k2, k3, k2
+ vptestmb k2{k2}, m7, m7 ; flat8in
por m2, m10, [maskq+0]{bcstd}
- pand m2, pbmask
- pcmpeqd k5, m2, pbmask
+ vptestmd k5, m2, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k3, k3, k5
+ vptestmb k3{k3}, m7, m7
kandnq k3, k2, k3 ; fm & !flat8 & !flat16
kandnq k2, k4, k2 ; flat8 & !flat16
%elif %1 != 4
vpbroadcastd m0, [maskq+4]
- pand m2, m0, pbmask
- pcmpeqd k4, m2, pbmask
+ vptestmd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k2, k2, k4
+ vptestmb k2{k2}, m7, m7
kandq k2, k2, k3 ; flat8 & fm
por m0, [maskq+0]{bcstd}
- pand m0, pbmask
- pcmpeqd k4, m0, pbmask
+ vptestmd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k3, k3, k4
+ vptestmb k3{k3}, m7, m7
kandnq k3, k2, k3 ; fm & !flat8
%else
%ifidn %2, v
- pand m0, pbmask, [maskq+0]{bcstd}
+ vptestmd k4, pbmask, [maskq+0]{bcstd}
%else
vpbroadcastd m0, [maskq+0]
- pand m0, pbmask
+ vptestmd k4, m0, pbmask
%endif
- pcmpeqd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k3, k3, k4 ; fm
+ vptestmb k3{k3}, m7, m7 ; fm
%endif
; short filter
-%if is_uv
- SWAP m23, m22
- SWAP m24, m0
- SWAP m25, m12
- SWAP m26, m1
+%if %1 >= 8
+ SWAP m23, m15
%endif
- vpbroadcastd m23, [pb_3]
- vpbroadcastd m24, [pb_4]
- vpbroadcastd m25, [pb_16]
- vpbroadcastd m26, [pb_64]
+ vpbroadcastd m15, [pb_3]
+ vpbroadcastd m0, [pb_4]
+ vpbroadcastd m12, [pb_16]
+ vpbroadcastd m1, [pb_64]
pxor m3, pb128
pxor m6, pb128
psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
@@ -634,16 +606,16 @@ SECTION .text
paddsb m10, m11
paddsb m10, m11
paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
- paddsb m8, m10, m23
- paddsb m10, m24
+ paddsb m8, m10, m15
+ paddsb m10, m0
pand m8, [pb_248]{bcstd}
pand m10, [pb_248]{bcstd}
psrlq m8, 3
psrlq m10, 3
- pxor m8, m25
- pxor m10, m25
- psubb m8, m25 ; f2
- psubb m10, m25 ; f1
+ pxor m8, m12
+ pxor m10, m12
+ psubb m8, m12 ; f2
+ psubb m10, m12 ; f1
paddsb m4, m8
psubsb m5, m10
pxor m4, pb128
@@ -652,7 +624,7 @@ SECTION .text
pxor m10, pb128
pxor m8, m8
pavgb m8, m10 ; f=(f1+1)>>1
- psubb m8, m26
+ psubb m8, m1
knotq k1, k1
paddsb m3{k1}, m3, m8
psubsb m6{k1}, m6, m8
@@ -664,40 +636,40 @@ SECTION .text
%ifidn %2, v
lea t0, [dstq+mstrideq*8]
%endif
- SWAP m0, m16, m14
- SWAP m2, m17, m15
+ SWAP m24, m16, m14
+ SWAP m2, m17, m22
SWAP m7, m18
; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
; write -6
- vpbroadcastd m26, [pb_7_1]
- vpbroadcastd m25, [pb_2]
- punpcklbw m14, m0, m12
- punpckhbw m15, m0, m12
- pmaddubsw m10, m14, m26
- pmaddubsw m11, m15, m26 ; p6*7+p3
+ vpbroadcastd m1, [pb_7_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m14, m24, m25
+ punpckhbw m22, m24, m25
+ pmaddubsw m10, m14, m1
+ pmaddubsw m11, m22, m1 ; p6*7+p3
punpcklbw m8, m2, m7
punpckhbw m9, m2, m7
- pmaddubsw m8, m25
- pmaddubsw m9, m25
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3
%ifidn %2, h
vpbroadcastd m27, [pw_2048]
- vpbroadcastd m26, [pb_m1_1]
+ vpbroadcastd m1, [pb_m1_1]
%define pw2048 m27
- %define pbm1_1 m26
+ %define pbm1_1 m1
%endif
punpcklbw m8, m13, m3
punpckhbw m9, m13, m3
- pmaddubsw m8, m22
- pmaddubsw m9, m22
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
punpcklbw m8, m4, m5
punpckhbw m9, m4, m5
- pmaddubsw m8, m22
- pmaddubsw m9, m22
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
pmulhrsw m8, m10, pw2048
@@ -713,17 +685,17 @@ SECTION .text
; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
; write -5
pmaddubsw m14, pbm1_1
- pmaddubsw m15, pbm1_1
+ pmaddubsw m22, pbm1_1
paddw m10, m14
- paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
- punpcklbw m8, m0, m6
- punpckhbw m9, m0, m6
+ paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m24, m6
+ punpckhbw m9, m24, m6
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
SWAP m18, m8
- SWAP m22, m9
+ SWAP m23, m9
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
@@ -737,8 +709,8 @@ SECTION .text
; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
; write -4
SWAP m14, m16
- punpcklbw m8, m0, m13
- punpckhbw m9, m0, m13
+ punpcklbw m8, m24, m13
+ punpckhbw m9, m24, m13
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -756,21 +728,21 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
%else
- vpblendmb m8{k4}, m12, m8
+ vpblendmb m8{k4}, m25, m8
mova [rsp+3*64], m8
%endif
; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
; write -3
- SWAP m15, m17
- punpcklbw m8, m0, m3
- punpckhbw m9, m0, m3
+ SWAP m22, m17
+ punpcklbw m8, m24, m3
+ punpckhbw m9, m24, m3
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
- punpcklbw m8, m7, m15
- punpckhbw m7, m15
+ punpcklbw m8, m7, m22
+ punpckhbw m7, m22
pmaddubsw m8, pbm1_1
pmaddubsw m7, pbm1_1
paddw m10, m8
@@ -779,69 +751,69 @@ SECTION .text
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
- vpblendmb m23{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+ vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
; write -2
%ifidn %2, v
lea t0, [dstq+strideq*4]
%endif
- punpcklbw m8, m0, m4
- punpckhbw m9, m0, m4
+ punpcklbw m8, m24, m4
+ punpckhbw m9, m24, m4
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
- punpcklbw m8, m12, m19
- punpckhbw m9, m12, m19
- SWAP m1, m19
+ punpcklbw m8, m25, m29
+ punpckhbw m9, m25, m29
+ SWAP m26, m29
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
- SWAP m19, m8
- SWAP m24, m9
+ SWAP m29, m8
+ SWAP m0, m9
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
- vpblendmb m25{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+ vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
; write -1
%ifidn %2, h
- SWAP m28, m0
+ SWAP m28, m24
punpcklbw m8, m28, m5
- punpckhbw m0, m28, m5
+ punpckhbw m24, m28, m5
%else
- punpcklbw m8, m0, m5
- punpckhbw m0, m5
+ punpcklbw m8, m24, m5
+ punpckhbw m24, m5
%endif
pmaddubsw m8, pbm1_1
- pmaddubsw m0, pbm1_1
+ pmaddubsw m24, pbm1_1
paddw m10, m8
- paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
- punpcklbw m0, m13, m20
- punpckhbw m9, m13, m20
+ paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m24, m13, m30
+ punpckhbw m9, m13, m30
%ifidn %2, h
- SWAP m27, m20
+ SWAP m27, m30
%endif
- SWAP m13, m23
- pmaddubsw m0, pbm1_1
+ SWAP m13, m15
+ pmaddubsw m24, pbm1_1
pmaddubsw m9, pbm1_1
- paddw m10, m0
+ paddw m10, m24
paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
- SWAP m20, m0
- SWAP m23, m9
+ SWAP m30, m24
+ SWAP m15, m9
%ifidn %2, h
- SWAP m9, m0
+ SWAP m9, m24
%define pw2048 m9
%endif
- pmulhrsw m0, m10, pw2048
+ pmulhrsw m24, m10, pw2048
pmulhrsw m8, m11, pw2048
paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
- paddw m11, m22
- packuswb m0, m8
- punpcklbw m8, m3, m21
+ paddw m11, m23
+ packuswb m24, m8
+ punpcklbw m8, m3, m31
pmaddubsw m8, pbm1_1
paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
SWAP m18, m8
@@ -851,34 +823,34 @@ SECTION .text
SWAP m16, m9
%define pw2048 m16
%endif
- punpckhbw m9, m3, m21
- SWAP m3, m25
+ punpckhbw m9, m3, m31
+ SWAP m3, m12
pmaddubsw m9, pbm1_1
paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
- SWAP m22, m9
+ SWAP m23, m9
pmulhrsw m9, m11, pw2048
paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
%ifidn %2, h
- SWAP m2, m26
+ SWAP m2, m1
%define pbm1_1 m2
%endif
- vpblendmb m26{k4}, m4, m0 ; don't clobber p0/m4 since we need it in H
+ vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
; write +0
- SWAP m0, m21 ; q6
+ SWAP m24, m31 ; q6
packuswb m8, m9
%ifidn %2, h
- SWAP m21, m2
- %define pbm1_1 m21
+ SWAP m31, m2
+ %define pbm1_1 m31
%endif
- vpblendmb m25{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+ vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
; write +1
- punpcklbw m8, m4, m0
- punpckhbw m2, m4, m0
- SWAP m4, m26
+ punpcklbw m8, m4, m24
+ punpckhbw m2, m4, m24
+ SWAP m4, m1
pmaddubsw m8, pbm1_1
pmaddubsw m2, pbm1_1
paddw m10, m8
@@ -892,9 +864,9 @@ SECTION .text
; write +2
paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
paddw m11, m7
- punpcklbw m8, m5, m0
- punpckhbw m9, m5, m0
- SWAP m5, m25
+ punpcklbw m8, m5, m24
+ punpckhbw m9, m5, m24
+ SWAP m5, m12
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -906,10 +878,10 @@ SECTION .text
; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
; write +3
- paddw m10, m19 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
- paddw m11, m24
- punpcklbw m8, m6, m0
- punpckhbw m9, m6, m0
+ paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ paddw m11, m0
+ punpcklbw m8, m6, m24
+ punpckhbw m9, m6, m24
SWAP 2, 6
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
@@ -921,20 +893,20 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+mstrideq]{k4}, m8
%else
- SWAP m19, m16
- %define pw2048 m19
- vpblendmb m16{k4}, m15, m8
+ SWAP m29, m16
+ %define pw2048 m29
+ vpblendmb m16{k4}, m22, m8
%endif
; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
; write +4
- paddw m10, m20 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
- paddw m11, m23
+ paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ paddw m11, m15
%ifidn %2, h
- SWAP m23, m8
+ SWAP m15, m8
%endif
- punpcklbw m8, m14, m0
- punpckhbw m9, m14, m0
+ punpcklbw m8, m14, m24
+ punpckhbw m9, m14, m24
SWAP 14, 7
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
@@ -946,16 +918,16 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
%else
- vpblendmb m17{k4}, m1, m8
+ vpblendmb m17{k4}, m26, m8
%endif
; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
; write +5
paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
- paddw m11, m22
- punpcklbw m8, m15, m0
- punpckhbw m9, m15, m0
- SWAP m20, m0
+ paddw m11, m23
+ punpcklbw m8, m22, m24
+ punpckhbw m9, m22, m24
+ SWAP m30, m24
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -979,26 +951,26 @@ SECTION .text
vpbroadcastd m9, [pb_3_1]
vpbroadcastd m10, [pb_2_1]
%if %1 == 16
- vpbroadcastd m22, [pb_1]
- vpbroadcastd m24, [pb_4]
+ vpbroadcastd m23, [pb_1]
+ vpbroadcastd m0, [pb_4]
%elifidn %2, h
- vpbroadcastd m21, [pb_m1_1]
- %define pbm1_1 m21
+ vpbroadcastd m31, [pb_m1_1]
+ %define pbm1_1 m31
%endif
- punpcklbw m0, m12, m3
- punpckhbw m1, m12, m3
- pmaddubsw m2, m0, m9
- pmaddubsw m7, m1, m9 ; 3 * p3 + p1
+ punpcklbw m24, m25, m3
+ punpckhbw m26, m25, m3
+ pmaddubsw m2, m24, m9
+ pmaddubsw m7, m26, m9 ; 3 * p3 + p1
punpcklbw m8, m13, m4
punpckhbw m11, m13, m4
pmaddubsw m8, m10
pmaddubsw m11, m10
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
- punpcklbw m8, m5, m24
- punpckhbw m11, m5, m24
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ punpcklbw m8, m5, m0
+ punpckhbw m11, m5, m0
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
psrlw m8, m2, 3
@@ -1015,8 +987,8 @@ SECTION .text
%endif
%endif
- pmaddubsw m8, m0, pbm1_1
- pmaddubsw m11, m1, pbm1_1
+ pmaddubsw m8, m24, pbm1_1
+ pmaddubsw m11, m26, pbm1_1
paddw m2, m8
paddw m7, m11
punpcklbw m8, m13, m6
@@ -1035,14 +1007,14 @@ SECTION .text
SWAP m18, m8
%endif
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- psubw m2, m0
- psubw m7, m1
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
punpcklbw m8, m4, m14
punpckhbw m11, m4, m14
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
paddw m2, m8
paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
psrlw m8, m2, 3
@@ -1052,19 +1024,19 @@ SECTION .text
%ifidn %2, v
mova [t0+stride3q], m8
%else
- SWAP m19, m8
+ SWAP m29, m8
%endif
- punpcklbw m0, m5, m15
- punpckhbw m1, m5, m15
- pmaddubsw m8, m0, m22
- pmaddubsw m11, m1, m22
+ punpcklbw m24, m5, m22
+ punpckhbw m26, m5, m22
+ pmaddubsw m8, m24, m23
+ pmaddubsw m11, m26, m23
paddw m2, m8
paddw m7, m11
- punpcklbw m8, m4, m12
- punpckhbw m11, m4, m12
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ punpcklbw m8, m4, m25
+ punpckhbw m11, m4, m25
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
psubw m2, m8
psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
psrlw m8, m2, 3
@@ -1075,10 +1047,10 @@ SECTION .text
mova [dstq+strideq*0], m11
%endif
- pmaddubsw m0, pbm1_1
- pmaddubsw m1, pbm1_1
- paddw m2, m0
- paddw m7, m1
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m26, pbm1_1
+ paddw m2, m24
+ paddw m7, m26
punpcklbw m8, m13, m6
punpckhbw m13, m6
pmaddubsw m8, pbm1_1
@@ -1093,18 +1065,18 @@ SECTION .text
mova [dstq+strideq*1], m13
%endif
- punpcklbw m0, m3, m6
- punpckhbw m1, m3, m6
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- psubw m2, m0
- psubw m7, m1
- punpcklbw m0, m14, m15
- punpckhbw m1, m14, m15
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- paddw m2, m0
- paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
+ punpcklbw m24, m3, m6
+ punpckhbw m26, m3, m6
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m24, m14, m22
+ punpckhbw m26, m14, m22
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ paddw m2, m24
+ paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
psrlw m2, 3
psrlw m7, 3
packuswb m2, m7
@@ -1120,36 +1092,36 @@ SECTION .text
%endif
%ifidn %2, h
- SWAP m0, m18
- SWAP m1, m19
+ SWAP m24, m18
+ SWAP m26, m29
%if %1 == 8
; 16x8 transpose
- punpcklbw m3, m12, m10
- punpckhbw m12, m10
- punpcklbw m10, m0, m1
- punpckhbw m0, m1
- punpcklbw m1, m11, m13
+ punpcklbw m3, m25, m10
+ punpckhbw m25, m10
+ punpcklbw m10, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m11, m13
punpckhbw m11, m13
- punpcklbw m13, m2, m15
- punpckhbw m2, m15
+ punpcklbw m13, m2, m22
+ punpckhbw m2, m22
;
- punpcklwd m15, m3, m10
+ punpcklwd m22, m3, m10
punpckhwd m3, m10
- punpcklwd m10, m12, m0
- punpckhwd m12, m0
- punpcklwd m0, m1, m13
- punpckhwd m1, m13
+ punpcklwd m10, m25, m24
+ punpckhwd m25, m24
+ punpcklwd m24, m26, m13
+ punpckhwd m26, m13
punpcklwd m13, m11, m2
punpckhwd m11, m2
;
- punpckldq m2, m15, m0
- punpckhdq m15, m0
- punpckldq m0, m3, m1
- punpckhdq m3, m1
- punpckldq m1, m10, m13
+ punpckldq m2, m22, m24
+ punpckhdq m22, m24
+ punpckldq m24, m3, m26
+ punpckhdq m3, m26
+ punpckldq m26, m10, m13
punpckhdq m10, m13
- punpckldq m13, m12, m11
- punpckhdq m12, m11
+ punpckldq m13, m25, m11
+ punpckhdq m25, m11
; write 8x32
vpbroadcastd ym16, strided
pmulld ym16, [hmulD]
@@ -1162,8 +1134,8 @@ SECTION .text
kmovb k3, k6
kmovb k4, k6
vpscatterdq [dstq+ym16-4]{k1}, m2
- vpscatterdq [t1 +ym16-4]{k2}, m15
- vpscatterdq [t2 +ym16-4]{k3}, m0
+ vpscatterdq [t1 +ym16-4]{k2}, m22
+ vpscatterdq [t2 +ym16-4]{k3}, m24
vpscatterdq [t3 +ym16-4]{k4}, m3
lea t1, [t0+strideq*2]
lea t2, [t0+strideq*4]
@@ -1172,29 +1144,29 @@ SECTION .text
kmovb k2, k6
kmovb k3, k6
kmovb k4, k6
- vpscatterdq [t0+ym16-4]{k1}, m1
+ vpscatterdq [t0+ym16-4]{k1}, m26
vpscatterdq [t1+ym16-4]{k2}, m10
vpscatterdq [t2+ym16-4]{k3}, m13
- vpscatterdq [t3+ym16-4]{k4}, m12
+ vpscatterdq [t3+ym16-4]{k4}, m25
%else
; 16x16 transpose and store
SWAP 5, 10, 2
- SWAP 6, 0
- SWAP 7, 1
+ SWAP 6, 24
+ SWAP 7, 26
SWAP 8, 11
SWAP 9, 13
- mova m0, [rsp+0*64]
- SWAP m1, m28
+ mova m24, [rsp+0*64]
+ SWAP m26, m28
mova m2, [rsp+1*64]
mova m3, [rsp+2*64]
mova m4, [rsp+3*64]
SWAP m11, m16
- SWAP m12, m17
+ SWAP m25, m17
SWAP m13, m27
- SWAP m14, m20
+ SWAP m14, m30
TRANSPOSE_16X16B 1, 0, [rsp+4*64]
- movu [dstq+strideq*0-8], xm0
- movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*0-8], xm24
+ movu [dstq+strideq*1-8], xm26
movu [dstq+strideq*2-8], xm2
movu [dstq+stride3q -8], xm3
lea t0, [dstq+strideq*4]
@@ -1208,13 +1180,13 @@ SECTION .text
movu [t0+strideq*2-8], xm10
movu [t0+stride3q -8], xm11
lea t0, [t0+strideq*4]
- movu [t0+strideq*0-8], xm12
+ movu [t0+strideq*0-8], xm25
movu [t0+strideq*1-8], xm13
movu [t0+strideq*2-8], xm14
- movu [t0+stride3q -8], xm15
+ movu [t0+stride3q -8], xm22
lea t0, [t0+strideq*4]
- vextracti128 [t0+strideq*0-8], ym0, 1
- vextracti128 [t0+strideq*1-8], ym1, 1
+ vextracti128 [t0+strideq*0-8], ym24, 1
+ vextracti128 [t0+strideq*1-8], ym26, 1
vextracti128 [t0+strideq*2-8], ym2, 1
vextracti128 [t0+stride3q -8], ym3, 1
lea t0, [t0+strideq*4]
@@ -1228,13 +1200,13 @@ SECTION .text
vextracti128 [t0+strideq*2-8], ym10, 1
vextracti128 [t0+stride3q -8], ym11, 1
lea t0, [t0+strideq*4]
- vextracti128 [t0+strideq*0-8], ym12, 1
+ vextracti128 [t0+strideq*0-8], ym25, 1
vextracti128 [t0+strideq*1-8], ym13, 1
vextracti128 [t0+strideq*2-8], ym14, 1
- vextracti128 [t0+stride3q -8], ym15, 1
+ vextracti128 [t0+stride3q -8], ym22, 1
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m0, 2
- vextracti32x4 [t0+strideq*1-8], m1, 2
+ vextracti32x4 [t0+strideq*0-8], m24, 2
+ vextracti32x4 [t0+strideq*1-8], m26, 2
vextracti32x4 [t0+strideq*2-8], m2, 2
vextracti32x4 [t0+stride3q -8], m3, 2
lea t0, [t0+strideq*4]
@@ -1248,13 +1220,13 @@ SECTION .text
vextracti32x4 [t0+strideq*2-8], m10, 2
vextracti32x4 [t0+stride3q -8], m11, 2
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m12, 2
+ vextracti32x4 [t0+strideq*0-8], m25, 2
vextracti32x4 [t0+strideq*1-8], m13, 2
vextracti32x4 [t0+strideq*2-8], m14, 2
- vextracti32x4 [t0+stride3q -8], m15, 2
+ vextracti32x4 [t0+stride3q -8], m22, 2
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m0, 3
- vextracti32x4 [t0+strideq*1-8], m1, 3
+ vextracti32x4 [t0+strideq*0-8], m24, 3
+ vextracti32x4 [t0+strideq*1-8], m26, 3
vextracti32x4 [t0+strideq*2-8], m2, 3
vextracti32x4 [t0+stride3q -8], m3, 3
lea t0, [t0+strideq*4]
@@ -1268,19 +1240,15 @@ SECTION .text
vextracti32x4 [t0+strideq*2-8], m10, 3
vextracti32x4 [t0+stride3q -8], m11, 3
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m12, 3
+ vextracti32x4 [t0+strideq*0-8], m25, 3
vextracti32x4 [t0+strideq*1-8], m13, 3
vextracti32x4 [t0+strideq*2-8], m14, 3
- vextracti32x4 [t0+stride3q -8], m15, 3
+ vextracti32x4 [t0+stride3q -8], m22, 3
%endif
%endif
%elif %1 == 6
; flat6 filter
- SWAP m15, m23
- SWAP m0, m24
- SWAP m12, m25
- SWAP m1, m26
vpbroadcastd m15, [pb_3_1]
vpbroadcastd m12, [pb_2]
punpcklbw m8, m13, m5
@@ -1381,17 +1349,16 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
mov mstrideq, strideq
neg mstrideq
lea stride3q, [strideq*3]
- mova m31, [pb_4x0_4x4_4x8_4x12]
- mova m30, [pb_mask]
- vpbroadcastd m29, [pb_128]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
vpbroadcastd m28, [pb_m1_1]
vpbroadcastd m27, [pw_2048]
- %define pbshuf m31
- %define pbmask m30
- %define pb128 m29
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
%define pbm1_1 m28
%define pw2048 m27
- %define is_uv 0
.loop:
cmp word [maskq+8], 0 ; vmask[2]
@@ -1411,7 +1378,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call .v4
.end:
add lq, 64
@@ -1420,6 +1387,11 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
sub wd, 16
jg .loop
RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+ FILTER 4, v
+ ret
cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
lut, h, stride3, stride8
@@ -1429,11 +1401,11 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
lea stride3q, [strideq*3]
lea stride8q, [strideq*8]
kxnorw k6, k6, k6
- vpbroadcastd m29, strided
- vpbroadcastd m30, l_strided
- pmulld m31, m29, [hmulA]
- pmulld m30, m30, [hmulB]
- pmulld m29, m29, [hmulC]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
%define pbshuf [pb_4x0_4x4_4x8_4x12]
%define pbmask [pb_mask]
%define pb128 [pb_128]{bcstd}
@@ -1457,7 +1429,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, h
+ call .h4
.end:
lea lq, [lq+l_strideq*8]
@@ -1466,9 +1438,13 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
sub hd, 16
jg .loop
RET
+ALIGN function_align
RESET_MM_PERMUTATION
+.h4:
+ FILTER 4, h
+ ret
-cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
lut, w, stride3, mstride
DECLARE_REG_TMP 9
shl l_strideq, 2
@@ -1476,16 +1452,15 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
mov mstrideq, strideq
neg mstrideq
lea stride3q, [strideq*3]
- mova m20, [pb_4x0_4x4_4x8_4x12]
- mova m19, [pb_mask]
- vpbroadcastd m18, [pb_128]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
vpbroadcastd m17, [pb_m1_1]
vpbroadcastd m16, [pw_4096]
- %define pbshuf m20
- %define pbmask m19
- %define pb128 m18
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
%define pbm1_1 m17
- %define is_uv 1
.loop:
cmp word [maskq+4], 0 ; vmask[1]
@@ -1498,7 +1473,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
.end:
add lq, 64
@@ -1525,17 +1500,14 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
vpbroadcastd m19, strided
vpbroadcastd m20, l_strided
pmulld m21, m19, [hmulA]
- pmulld m20, m20, [hmulB]
- pmulld m19, m19, [hmulC]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
mova m18, [pb_mask]
vpbroadcastd m17, [pb_128]
vpbroadcastd m16, [pw_4096]
%define pbshuf [pb_4x0_4x4_4x8_4x12]
%define pbmask m18
%define pb128 m17
- %xdefine m31 m21
- %xdefine m30 m20
- %xdefine m29 m19
add l_strideq, l_strideq
.loop:
@@ -1549,7 +1521,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, h
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
.end:
lea lq, [lq+l_strideq*8]
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h
new file mode 100644
index 00000000000..de23be8866c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#include "common/intops.h"
+
+#define decl_wiener_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
+decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
+
+#define decl_sgr_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))
+
+decl_wiener_filter_fns(sse2);
+decl_wiener_filter_fns(ssse3);
+decl_wiener_filter_fns(avx2);
+decl_wiener_filter_fns(avx512icl);
+decl_sgr_filter_fns(ssse3);
+decl_sgr_filter_fns(avx2);
+decl_sgr_filter_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+#if BITDEPTH == 8
+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
+ }
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl);
+#if BITDEPTH == 8
+ /* With VNNI we don't need a 5-tap version. */
+ c->wiener[1] = c->wiener[0];
+#else
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl);
+#endif
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl);
+ }
+#endif
+}
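
Editor's note: the init helper added above follows the usual fall-through pattern — each ISA tier overwrites the pointers set by the previous tier, and the function returns as soon as a required CPU flag is missing, so the most capable supported implementation is what ends up in the DSP table. A minimal C sketch of that pattern follows; the flag values and function names are illustrative only, not dav1d's real API.

    #include <stdio.h>

    enum { FLAG_SSE2 = 1 << 0, FLAG_SSSE3 = 1 << 1, FLAG_AVX2 = 1 << 2 };

    typedef void (*filter_fn)(void);

    static void wiener_sse2(void)  { puts("wiener_sse2");  }
    static void wiener_ssse3(void) { puts("wiener_ssse3"); }
    static void wiener_avx2(void)  { puts("wiener_avx2");  }

    static void dsp_init(filter_fn *wiener, unsigned flags) {
        if (!(flags & FLAG_SSE2)) return;   /* baseline not available: leave C fallback */
        *wiener = wiener_sse2;
        if (!(flags & FLAG_SSSE3)) return;  /* each tier overwrites the previous one */
        *wiener = wiener_ssse3;
        if (!(flags & FLAG_AVX2)) return;
        *wiener = wiener_avx2;              /* best supported version wins */
    }

    int main(void) {
        filter_fn wiener = 0;
        dsp_init(&wiener, FLAG_SSE2 | FLAG_SSSE3); /* e.g. a CPU without AVX2 */
        if (wiener) wiener();                      /* prints "wiener_ssse3" */
        return 0;
    }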
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm
index 5669ce66d8f..1e571774caf 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm
@@ -329,11 +329,11 @@ ALIGN function_align
packuswb m2, m4
psrlw m2, 8
vpackuswb m2{k2}, m3, m5
- mova [dstq+r10], m2
- add r10, 64
- jl .hv_loop
- mov t6, t5
- mov t5, t4
+ movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
+ add r10, 64 ; function is used for chroma as well, and in some
+ jl .hv_loop ; esoteric edge cases chroma dst pointers may only
+ mov t6, t5 ; have a 32-byte alignment despite having a width
+ mov t5, t4 ; larger than 32, so use an unaligned store here.
mov t4, t3
mov t3, t2
mov t2, t1
@@ -379,7 +379,7 @@ ALIGN function_align
packuswb m0, m2
psrlw m0, 8
vpackuswb m0{k2}, m1, m3
- mova [dstq+r10], m0
+ movu [dstq+r10], m0
add r10, 64
jl .v_loop
mov t6, t5
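
Editor's note: the mova→movu change above swaps an aligned 64-byte store for an unaligned one because, as the inline comment explains, chroma destination pointers may only be 32-byte aligned in rare cases even when the row is wider than 32 bytes. The same trade-off expressed in C with the standard AVX-512 intrinsics is sketched below; the helper name and buffer setup are illustrative, and running it requires an AVX-512 machine and a build with -mavx512f.

    #include <immintrin.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Store one 64-byte row: aligned store (like mova) only when the pointer
     * is known to be 64-byte aligned, otherwise the unaligned form (like movu),
     * which never faults and costs nothing extra when the data is aligned. */
    static void store_row(uint8_t *dst, __m512i v) {
        if (((uintptr_t)dst & 63) == 0)
            _mm512_store_si512((void *)dst, v);
        else
            _mm512_storeu_si512((void *)dst, v);
    }

    int main(void) {
        /* Only 32-byte alignment is guaranteed here, mirroring the chroma
         * edge case described in the comment above. */
        uint8_t *plane = aligned_alloc(32, 128);
        if (!plane) return 1;
        store_row(plane, _mm512_setzero_si512());
        free(plane);
        return 0;
    }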
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc.h b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h
new file mode 100644
index 00000000000..65c607e180c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/mc.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(name, sse2)); \
+ decl_##type##_fn(BF(name, ssse3)); \
+ decl_##type##_fn(BF(name, avx2)); \
+ decl_##type##_fn(BF(name, avx512icl));
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+#define init_mc_scaled_fn(type, name, suffix) \
+ c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_scaled_fn(type, name, suffix) \
+ c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
+
+decl_fn(mc, dav1d_put_8tap_regular);
+decl_fn(mc, dav1d_put_8tap_regular_smooth);
+decl_fn(mc, dav1d_put_8tap_regular_sharp);
+decl_fn(mc, dav1d_put_8tap_smooth);
+decl_fn(mc, dav1d_put_8tap_smooth_regular);
+decl_fn(mc, dav1d_put_8tap_smooth_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp_regular);
+decl_fn(mc, dav1d_put_8tap_sharp_smooth);
+decl_fn(mc, dav1d_put_bilin);
+
+decl_fn(mct, dav1d_prep_8tap_regular);
+decl_fn(mct, dav1d_prep_8tap_regular_smooth);
+decl_fn(mct, dav1d_prep_8tap_regular_sharp);
+decl_fn(mct, dav1d_prep_8tap_smooth);
+decl_fn(mct, dav1d_prep_8tap_smooth_regular);
+decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp_regular);
+decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
+decl_fn(mct, dav1d_prep_bilin);
+
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
+decl_fn(mc_scaled, dav1d_put_bilin_scaled);
+
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
+decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
+
+decl_fn(avg, dav1d_avg);
+decl_fn(w_avg, dav1d_w_avg);
+decl_fn(mask, dav1d_mask);
+decl_fn(w_mask, dav1d_w_mask_420);
+decl_fn(w_mask, dav1d_w_mask_422);
+decl_fn(w_mask, dav1d_w_mask_444);
+decl_fn(blend, dav1d_blend);
+decl_fn(blend_dir, dav1d_blend_v);
+decl_fn(blend_dir, dav1d_blend_h);
+
+decl_fn(warp8x8, dav1d_warp_affine_8x8);
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
+decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
+
+decl_fn(emu_edge, dav1d_emu_edge);
+
+decl_fn(resize, dav1d_resize);
+
+static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
+ return;
+
+#if BITDEPTH == 8
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
+
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
+#endif
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ c->avg = BF(dav1d_avg, ssse3);
+ c->w_avg = BF(dav1d_w_avg, ssse3);
+ c->mask = BF(dav1d_mask, ssse3);
+ c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
+ c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
+ c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
+ c->blend = BF(dav1d_blend, ssse3);
+ c->blend_v = BF(dav1d_blend_v, ssse3);
+ c->blend_h = BF(dav1d_blend_h, ssse3);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
+ c->emu_edge = BF(dav1d_emu_edge, ssse3);
+ c->resize = BF(dav1d_resize, ssse3);
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+ return;
+
+#if BITDEPTH == 8
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ c->avg = BF(dav1d_avg, avx2);
+ c->w_avg = BF(dav1d_w_avg, avx2);
+ c->mask = BF(dav1d_mask, avx2);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
+ c->blend = BF(dav1d_blend, avx2);
+ c->blend_v = BF(dav1d_blend_v, avx2);
+ c->blend_h = BF(dav1d_blend_h, avx2);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
+ c->emu_edge = BF(dav1d_emu_edge, avx2);
+ c->resize = BF(dav1d_resize, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ c->avg = BF(dav1d_avg, avx512icl);
+ c->w_avg = BF(dav1d_w_avg, avx512icl);
+ c->mask = BF(dav1d_mask, avx512icl);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
+ c->blend = BF(dav1d_blend, avx512icl);
+ c->blend_v = BF(dav1d_blend_v, avx512icl);
+ c->blend_h = BF(dav1d_blend_h, avx512icl);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+ c->resize = BF(dav1d_resize, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm
index e83b18ad969..585ba53e080 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm
@@ -1604,7 +1604,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
vpbroadcastd m11, [buf+ 4]
vpbroadcastd m12, [buf+ 8]
vpbroadcastd m13, [buf+12]
- cmp wd, 16
+ sub wd, 16
je .h_w16
jg .h_w32
.h_w8:
@@ -3615,32 +3615,32 @@ ALIGN function_align
.w4:
movq [dstq ], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq ], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq ], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm0, ym1, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ vextracti32x4 xm0, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq ], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
@@ -3860,33 +3860,33 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb m3, m15, m3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
mova [maskq], xm3
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8:
@@ -4090,32 +4090,32 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
@@ -4249,32 +4249,32 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm
index eb3ca1c427d..7897f1decc1 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm
@@ -449,9 +449,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb ym0, ym4
pmaddubsw ym0, ym5
pmulhrsw ym0, ym3
- vpmovuswb xmm0, ym0
- movq [dstq+dsq*0], xmm0
- movhps [dstq+dsq*1], xmm0
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
@@ -755,9 +755,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pmulhw ym1, ym6
paddw ym1, ym2
pmulhrsw ym1, ym7
- vpmovuswb xmm1, ym1
- movq [dstq+dsq*0], xmm1
- movhps [dstq+dsq*1], xmm1
+ vpmovuswb xm1, ym1
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
@@ -1588,13 +1588,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_w4_loop
RET
.h_w8:
- movu xmm0, [srcq+ssq*0]
- vinserti32x4 ym0, ymm0, [srcq+ssq*1], 1
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
- vpmovuswb xmm0, ym0
- movq [dstq+dsq*0], xmm0
- movhps [dstq+dsq*1], xmm0
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
@@ -3308,17 +3308,17 @@ ALIGN function_align
cmp hd, 8
jg .w4_h16
WRAP_YMM %1 0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq ], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_ret
lea dstq, [dstq+strideq*4]
pextrd [dstq ], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_ret:
RET
.w4_h16:
@@ -3332,29 +3332,29 @@ ALIGN function_align
cmp hd, 4
jne .w8_h8
WRAP_YMM %1 0
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
%1_INC_PTR 2
lea dstq, [dstq+strideq*4]
.w8_h8:
%1 0
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq ], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3415,8 +3415,8 @@ ALIGN function_align
paddw m0, [tmp2q+(%1+0)*mmsize]
mova m1, [tmp1q+(%1+1)*mmsize]
paddw m1, [tmp2q+(%1+1)*mmsize]
- pmulhrsw m0, m2
- pmulhrsw m1, m2
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
packuswb m0, m1
%endmacro
@@ -3425,13 +3425,13 @@ ALIGN function_align
add tmp2q, %1*mmsize
%endmacro
-cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
lea r6, [avg_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, dword [r6+wq*4]
- vpbroadcastd m2, [base+pw_1024]
+ vpbroadcastd m4, [base+pw_1024]
add wq, r6
BIDIR_FN AVG
@@ -3573,17 +3573,17 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym5, [wm_420_perm4+32], 1
vpermb ym4, ym5, ym4
vpdpbusd ym8, ym4, ym9
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
vpermb ym8, ym10, ym8
movq [maskq], xm8
@@ -3609,11 +3609,11 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpdpbusd ym8, ym4, ym9
vpermb m8, m10, m8
mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3627,18 +3627,18 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpdpbusd m1, m4, m9
vpermb m1, m10, m1
mova [maskq], xm1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3766,17 +3766,17 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
movhps xm10, [wm_422_mask+16]
vpdpwssd ym8, ym4, ym9
vpermb ym8, ym10, ym8
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
pand xm8, xm11
mova [maskq], xm8
@@ -3801,11 +3801,11 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb ym8, ym10, ym8
pand xm8, xm11
mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3819,18 +3819,18 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb m1, m10, m1
pand ym1, ym11
mova [maskq], ym1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3936,17 +3936,17 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym8, [wm_444_mask+32], 1
vpermb ym4, ym8, ym4
mova [maskq], ym4
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
RET
.w4_h16:
@@ -3965,11 +3965,11 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym8, [wm_444_mask+32], 1
vpermb ym4, ym8, ym4
mova [maskq], ym4
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3980,18 +3980,18 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
W_MASK 0, 4, 0, 1, 1
vpermb m4, m8, m4
mova [maskq], m4
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h
index e11cd08c8a4..0bb632fb314 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h
@@ -28,21 +28,21 @@
#ifndef DAV1D_SRC_X86_MSAC_H
#define DAV1D_SRC_X86_MSAC_H
+#include "src/cpu.h"
+
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
-/* Needed for checkasm */
-unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-
#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
@@ -55,10 +55,21 @@ unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
#if ARCH_X86_64
#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
+
+static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+ }
+
+ if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+ }
+}
+
#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#endif
-void dav1d_msac_init_x86(MsacContext *const s);
-
#endif /* DAV1D_SRC_X86_MSAC_H */
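
The msac.h hunk above makes the x86 msac setup header-only: dav1d_msac_decode_symbol_adapt16 dispatches through the MsacContext's symbol_adapt16 pointer, and the new inline msac_init_x86() fills that pointer from the runtime CPU flags (SSE2 first, upgraded to AVX2 when available). A minimal caller-side sketch, assuming the generic init in src/msac.c is the call site; the wrapper name below and the direct include are illustrative assumptions, not part of this hunk:

/* Sketch only: names outside the hunk above (msac_init_example, the include
 * path) are assumptions for illustration. */
#include "src/msac.h"   /* pulls in src/x86/msac.h on x86 builds */

static void msac_init_example(MsacContext *const s) {
#if ARCH_X86_64
    /* Start from the SSE2 baseline declared in the header, then let the
     * inline helper from this patch upgrade the pointer per CPU flags. */
    s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
    msac_init_x86(s);
#endif
}
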
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h
index e3575ba4da7..de4124c436e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h
@@ -32,7 +32,7 @@ decl_splat_mv_fn(dav1d_splat_mv_sse2);
decl_splat_mv_fn(dav1d_splat_mv_avx2);
decl_splat_mv_fn(dav1d_splat_mv_avx512icl);
-COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
+static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;