diff options
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm')
-rw-r--r-- | chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm | 1482 |
1 files changed, 974 insertions, 508 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm index c580944c7bb..811f711540f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm @@ -30,7 +30,6 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482 itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 @@ -39,14 +38,17 @@ iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 -iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 -%macro COEF_PAIR 2 +%macro COEF_PAIR 2-3 0 pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) %define pd_%2 (pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define pd_%2_m%2 pd_%2 +%endif %endmacro COEF_PAIR 201, 995 @@ -56,8 +58,8 @@ COEF_PAIR 1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 COEF_PAIR 2751, 2106 -COEF_PAIR 2896, 1567 -COEF_PAIR 2896, 3784 +COEF_PAIR 2896, 1567, 1 +COEF_PAIR 2896, 3784, 1 COEF_PAIR 3035, 3513 COEF_PAIR 3166, 3920 COEF_PAIR 3703, 3290 @@ -66,9 +68,6 @@ COEF_PAIR 4017, 2276 COEF_PAIR 4076, 3612 COEF_PAIR 4091, 3973 -%define pd_1321 (pd_1321_2482 + 4*0) -%define pd_2482 (pd_1321_2482 + 4*4) - pd_8: dd 8 pd_m601: dd -601 pd_m1189: dd -1189 @@ -77,17 +76,23 @@ pd_m2106: dd -2106 pd_m2598: dd -2598 pd_m2751: dd -2751 pd_m3344: dd -3344 +pd_1024: dd 1024 +pd_1321: dd 1321 +pd_1448: dd 1448 +pd_1697: dd 1697 +pd_2482: dd 2482 +pd_3072: dd 3072 ; 1024 + 2048 pd_3803: dd 3803 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 pd_6144: dd 6144 ; 2048 + 4096 -pd_10239: dd 10239 ; 2048 + 8192 - 1 -pd_10240: dd 10240 ; 2048 + 8192 -pd_11586: dd 11586 ; 5793 * 2 -pd_34816: dd 34816 ; 2048 + 32768 -pd_38912: dd 38912 ; 2048 + 4096 + 32768 +pd_17408: dd 17408 ; 1024 + 16384 pixel_10bpc_max: times 2 dw 0x03ff pixel_12bpc_max: times 2 dw 0x0fff +dconly_10bpc: times 2 dw 0x7c00 +dconly_12bpc: times 2 dw 0x7000 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff clip_20b_min: dd -0x80000 @@ -214,7 +219,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 -; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2 +; flags: 1 = packed, 2 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags %if %8 < 32 @@ -241,7 +246,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax pmulld m%1, m%5 pmulld m%2, m%5 %endif -%if %9 & 4 +%if %9 & 2 psubd m%4, m%6, m%4 psubd m%2, m%4, m%2 %else @@ -250,17 +255,10 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax %endif paddd m%2, m%4 %endif -%if %9 & 2 ; invert the upper half of dst1 before rounding - vbroadcasti128 m%4, [pw_2048_m2048] - psubd m%1, m%3 - psignd m%1, m%4 - paddd m%1, m%6 -%else %ifnum %6 paddd m%1, m%6 %endif psubd m%1, m%3 -%endif %ifnum %6 psrad m%2, 12 psrad m%1, 12 @@ -287,37 +285,39 @@ ALIGN function_align %endif %endmacro -%macro INV_TXFM_4X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x4 -%ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - movd xm1, [pw_2896x8] - mov [cq], eobd ; 0 - add r6d, 2048 - sar r6d, 12 - movd xm0, r6d - packssdw xm0, xm0 - pmulhrsw xm0, xm1 - vpbroadcastw xm0, xm0 - mova xm1, xm0 - jmp m(iadst_4x4_internal_10bpc).end -%endif -%endmacro - -%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x4, 12 +%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x4, %3 %ifidn %1_%2, dct_dct + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 + or r3d, 4 +.dconly2: add r6d, 128 sar r6d, 8 +.dconly3: imul r6d, 181 - add r6d, 128 - sar r6d, 8 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d - vpbroadcastd m0, xm0 - mova m1, m0 - jmp m(iadst_4x4_internal_12bpc).end + paddsw xm0, xm2 + vpbroadcastw xm0, xm0 +.dconly_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddsw xm1, xm0 + psubusw xm1, xm2 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + WRAP_XMM RET +%else + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly +%endif %endif %endmacro @@ -399,12 +399,50 @@ INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity +%macro IADST4_1D 0 + vpbroadcastd m5, [pd_1321] + vpbroadcastd m7, [pd_2482] + pmulld m4, m0, m5 ; 1321*in0 + pmulld m6, m3, m7 ; 2482*in3 + paddd m4, m6 ; 1321*in0 + 2482*in3 + pmulld m6, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m5 ; pd_3803 + pmulld m5, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [pd_m3344] + pmulld m1, m0 ; -t3 + pmulld m2, m0 ; out2 (unrounded) + psubd m6, m5 ; 2482*in0 - 1321*in2 + paddd m4, m7 ; t0 + psubd m6, m3 ; t1 + paddd m3, m4, m6 + psubd m4, m1 ; out0 (unrounded) + psubd m6, m1 ; out1 (unrounded) + paddd m3, m1 ; out3 (unrounded) +%endmacro + cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass1_end: + vpbroadcastd m5, [pd_2048] + mova m2, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 packssdw m0, m1 - vpermd m0, m4, m0 - psrld m4, 4 - pshufb m0, m4 + vpermd m0, m2, m0 + psrld m2, 4 + pshufb m0, m2 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif jmp tx2q .pass2: lea r6, [deint_shuf+128] @@ -436,35 +474,16 @@ cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 RET ALIGN function_align .main: - mova m2, [cq+16*2] - vbroadcasti128 m5, [cq+16*0] + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] +%if WIN64 + movaps [rsp+16], xmm6 + movaps [rsp+32], xmm7 +%endif .main2: - mova m0, [pd_1321_2482] - vpbroadcastd m3, [pd_3803] - vpbroadcastd m1, [pd_m3344] - pmulld m4, m0, m2 - pmulld m3, m2 - pmulld m0, m5 - vpbroadcastd m5, [pd_2048] - psubd xm2, [cq+16*3] - psubd m2, [cq+16*0] - pmulld m2, m1 ; t2 t3 - vpermq m4, m4, q1032 - paddd m4, m3 - psubd m0, m4 - paddd xm4, xm4 - paddd m4, m0 ; t0 t1 - vinserti128 m3, m2, xm4, 1 ; t2 t0 - paddd m0, m4, m5 - psubd xm4, xm2 - psubd m1, m0, m2 - vpermq m2, m2, q3232 ; t3 t3 - psubd m1, m4 - mova m4, [itx4_shuf] - paddd m0, m2 ; out0 out1 - paddd m1, m3 ; out2 out3 - psrad m0, 12 - psrad m1, 12 + WRAP_XMM IADST4_1D ret INV_TXFM_4X4_FN flipadst, dct @@ -474,12 +493,9 @@ INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - packssdw m0, m1 - psrld m1, m4, 8 - vpermd m0, m1, m0 - psrld m4, 4 - pshufb m0, m4 - jmp tx2q + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_10bpc).pass1_end .pass2: lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 @@ -556,19 +572,20 @@ cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 movhps [r6 +strideq*1], xm1 RET -INV_TXFM_4X4_12BPC_FN dct, dct -INV_TXFM_4X4_12BPC_FN dct, identity -INV_TXFM_4X4_12BPC_FN dct, adst -INV_TXFM_4X4_12BPC_FN dct, flipadst +INV_TXFM_4X4_FN dct, dct, 12 +INV_TXFM_4X4_FN dct, identity, 12 +INV_TXFM_4X4_FN dct, adst, 12 +INV_TXFM_4X4_FN dct, flipadst, 12 -cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(idct_4x4_internal_10bpc).main mova m3, [idct4_12_shuf] mova m4, [idct4_12_shuf2] - vpermd m2, m3, m0 - vpermd m1, m4, m1 - jmp m(iadst_4x4_internal_12bpc).pass1_end + vpermd m2, m4, m1 + vpermd m1, m3, m0 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: + vpbroadcastd m5, [pd_2048] vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(idct_4x4_internal_10bpc).main2 @@ -576,33 +593,52 @@ cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 vpermq m1, m1, q2031 jmp m(iadst_4x4_internal_12bpc).end -INV_TXFM_4X4_12BPC_FN adst, dct -INV_TXFM_4X4_12BPC_FN adst, adst -INV_TXFM_4X4_12BPC_FN adst, flipadst -INV_TXFM_4X4_12BPC_FN adst, identity +INV_TXFM_4X4_FN adst, dct, 12 +INV_TXFM_4X4_FN adst, adst, 12 +INV_TXFM_4X4_FN adst, flipadst, 12 +INV_TXFM_4X4_FN adst, identity, 12 -cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - vpermd m2, m4, m0 - vpermd m1, m4, m1 + vinserti128 m1, m4, xm6, 1 + vinserti128 m2, xm3, 1 .pass1_end: - punpcklqdq m0, m2, m1 - punpckhqdq m1, m2, m1 + mova m3, [itx4_shuf] + vpbroadcastd m5, [pd_1024] + psrad m1, 1 + psrad m2, 1 + vpermd m1, m3, m1 + vpermd m2, m3, m2 + paddd m1, m5 + paddd m2, m5 + psrad m1, 11 + psrad m2, 11 .pass1_end2: vpbroadcastd m3, [clip_18b_min] vpbroadcastd m4, [clip_18b_max] + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 pmaxsd m0, m3 pmaxsd m1, m3 pminsd m0, m4 pminsd m1, m4 jmp tx2q .pass2: - mova [cq+16*0], m0 - vextracti128 [cq+16*3], m1, 1 - mova m2, m1 - vpermq m5, m0, q1010 - call m(iadst_4x4_internal_10bpc).main2 + call .main_pass2 + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass2_end: + vpbroadcastd m5, [pd_2048] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 .end: +%if WIN64 + WIN64_RESTORE_XMM_INTERNAL + %assign xmm_regs_used 6 +%endif +.end2: vpbroadcastd m4, [pw_16384] movq xm2, [dstq+strideq*0] movq xm3, [dstq+strideq*1] @@ -627,53 +663,53 @@ cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 movhps [r6 +strideq*0], xm0 movhps [r6 +strideq*1], xm1 RET +.main_pass2: + vextracti128 xm3, m1, 1 + mova xm2, xm1 + vextracti128 xm1, m0, 1 + jmp m(iadst_4x4_internal_10bpc).main2 -INV_TXFM_4X4_12BPC_FN flipadst, dct -INV_TXFM_4X4_12BPC_FN flipadst, adst -INV_TXFM_4X4_12BPC_FN flipadst, flipadst -INV_TXFM_4X4_12BPC_FN flipadst, identity +INV_TXFM_4X4_FN flipadst, dct, 12 +INV_TXFM_4X4_FN flipadst, adst, 12 +INV_TXFM_4X4_FN flipadst, flipadst, 12 +INV_TXFM_4X4_FN flipadst, identity, 12 -cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - psrld m4, 8 - vpermd m2, m4, m0 - vpermd m1, m4, m1 - punpckhqdq m0, m1, m2 - punpcklqdq m1, m2 - jmp m(iadst_4x4_internal_12bpc).pass1_end2 + vinserti128 m1, m3, xm2, 1 + vinserti128 m2, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass1_end .pass2: - mova [cq+16*0], m0 - vextracti128 [cq+16*3], m1, 1 - mova m2, m1 - vpermq m5, m0, q1010 - call m(iadst_4x4_internal_10bpc).main2 - vpermq m2, m0, q1032 - vpermq m0, m1, q1032 - mova m1, m2 - jmp m(iadst_4x4_internal_12bpc).end - -INV_TXFM_4X4_12BPC_FN identity, dct -INV_TXFM_4X4_12BPC_FN identity, adst -INV_TXFM_4X4_12BPC_FN identity, flipadst -INV_TXFM_4X4_12BPC_FN identity, identity - -cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 - vpbroadcastd m1, [pd_5793] - pmulld m0, m1, [cq+32*0] - pmulld m1, [cq+32*1] + call m(iadst_4x4_internal_12bpc).main_pass2 + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass2_end + +INV_TXFM_4X4_FN identity, dct, 12 +INV_TXFM_4X4_FN identity, adst, 12 +INV_TXFM_4X4_FN identity, flipadst, 12 +INV_TXFM_4X4_FN identity, identity, 12 + +cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + mova m2, [itx4_shuf] + vpbroadcastd m3, [pd_1697] + vpermd m0, m2, [cq+32*0] + vpermd m2, m2, [cq+32*1] vpbroadcastd m5, [pd_2048] - mova m3, [itx4_shuf] - paddd m0, m5 + pmulld m1, m3, m0 + pmulld m3, m2 paddd m1, m5 - psrad m0, 12 + paddd m3, m5 psrad m1, 12 - vpermd m2, m3, m0 - vpermd m1, m3, m1 - jmp m(iadst_4x4_internal_12bpc).pass1_end + psrad m3, 12 + paddd m1, m0 + paddd m2, m3 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: ; m0 = in0 in1 ; m1 = in2 in3 vpbroadcastd m3, [pd_5793] + vpbroadcastd m5, [pd_2048] pmulld m0, m3 pmulld m1, m3 paddd m0, m5 ; 2048 @@ -685,34 +721,19 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 8 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 2048 - sar r6d, 12 -.end: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 - movd xm0, r6d - vpbroadcastw xm0, xm0 - vpbroadcastd xm3, [pixel_%3bpc_max] - pxor xm2, xm2 -.end_loop: - movq xm1, [dstq+strideq*0] - movhps xm1, [dstq+strideq*1] - paddw xm1, xm0 - pmaxsw xm1, xm2 - pminsw xm1, xm3 - movq [dstq+strideq*0], xm1 - movhps [dstq+strideq*1], xm1 - lea dstq, [dstq+strideq*2] - sub r3d, 2 - jg .end_loop - WRAP_XMM RET + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 +%else + jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly +%endif %endif %endmacro @@ -797,12 +818,14 @@ INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 -.pass1: call m(iadst_8x4_internal_10bpc).main - psrad m0, m4, 12 - psrad m1, m5, 12 - psrad m2, 12 - psrad m3, 12 + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass1_end: + REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: call .pass2_main @@ -918,13 +941,13 @@ INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 -.pass1: call m(iadst_8x4_internal_10bpc).main - psrad m0, m3, 12 - psrad m1, m2, 12 - psrad m2, m5, 12 - psrad m3, m4, 12 - jmp tx2q + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m2, m5, m6 + paddd m3, m5, m4 + jmp m(iadst_4x8_internal_10bpc).pass1_end .pass2: call m(iadst_4x8_internal_10bpc).pass2_main mova xm4, [pw_2048_m2048] @@ -1070,7 +1093,16 @@ INV_TXFM_4X8_FN adst, flipadst, 12 INV_TXFM_4X8_FN adst, identity, 12 cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 - jmp m(iadst_4x8_internal_10bpc).pass1 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m4, 1 + psrad m1, m6, 1 + psrad m2, 1 + psrad m3, 1 +.pass1_end: + vpbroadcastd m5, [pd_1024] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 11}, m0, m1, m2, m3 + jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] @@ -1146,7 +1178,12 @@ INV_TXFM_4X8_FN flipadst, flipadst, 12 INV_TXFM_4X8_FN flipadst, identity, 12 cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 - jmp m(iflipadst_4x8_internal_10bpc).pass1 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m3, 1 + psrad m1, m2, 1 + psrad m2, m6, 1 + psrad m3, m4, 1 + jmp m(iadst_4x8_internal_12bpc).pass1_end .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] @@ -1180,12 +1217,13 @@ cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x16, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd xm2, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 6144 - sar r6d, 13 - jmp m(inv_txfm_add_dct_dct_4x8_%3bpc).end + or r3d, 16 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 %endif %endmacro @@ -1196,7 +1234,7 @@ INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 .pass1: - vpbroadcastd m10, [pd_6144] + vpbroadcastd m10, [pd_3072] mova m1, [cq+32*2] mova m3, [cq+32*6] mova m5, [cq+32*3] @@ -1241,7 +1279,7 @@ ALIGN function_align vpbroadcastd m4, [pd_3784] vpbroadcastd m8, [pd_1567] vpbroadcastd m9, [pd_2048] - vpbroadcastd m6, [pd_2896] + vpbroadcastd m6, [pd_1448] ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h ret @@ -1253,7 +1291,7 @@ ALIGN function_align psubd m0, m2 paddd m9, m4, m6 psubd m4, m6 - REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h psubd m2, m0, m1 paddd m1, m0 psubd m6, m4, m5 @@ -1304,7 +1342,6 @@ INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 -.pass1: call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_6144] call m(iadst_16x4_internal_10bpc).main_end @@ -1545,7 +1582,6 @@ INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 -.pass1: vpbroadcastd m7, [pd_5793] pmulld m0, m7, [cq+32*0] pmulld m4, m7, [cq+32*1] @@ -1678,7 +1714,16 @@ INV_TXFM_4X16_FN adst, flipadst, 12 INV_TXFM_4X16_FN adst, identity, 12 cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iadst_4x16_internal_10bpc).pass1 + call .main_pass1 + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + psrad m6, 12 + psrad m7, 12 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -1740,6 +1785,22 @@ ALIGN function_align vperm2i128 m4, m8, m9, 0x20 ; 8 10 vperm2i128 m6, m8, m9, 0x31 ; 12 14 ret +ALIGN function_align +.main_pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_3072] + paddd m10, m4, m5 + psubd m4, m3 + psubd m5, m3 + paddd m3, m10 + psubd m8, m7, m1 + paddd m7, m9 + psubd m9, m1 + paddd m7, m1 + REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 + REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 + paddd m6, m0 + ret INV_TXFM_4X16_FN flipadst, dct, 12 INV_TXFM_4X16_FN flipadst, adst, 12 @@ -1747,7 +1808,16 @@ INV_TXFM_4X16_FN flipadst, flipadst, 12 INV_TXFM_4X16_FN flipadst, identity, 12 cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iflipadst_4x16_internal_10bpc).pass1 + call m(iadst_4x16_internal_12bpc).main_pass1 + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + psrad m4, m7, 12 + psrad m5, m6, 12 + psrad m6, m9, 12 + psrad m7, m8, 12 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -1772,17 +1842,49 @@ INV_TXFM_4X16_FN identity, flipadst, 12 INV_TXFM_4X16_FN identity, identity, 12 cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iidentity_4x16_internal_10bpc).pass1 + vpbroadcastd m8, [pd_1697] + mova m0, [cq+32*0] + mova m4, [cq+32*1] + mova m1, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m9, [pd_6144] + pmulld m2, m8, m0 + pmulld m6, m8, m4 + pmulld m3, m8, m1 + pmulld m7, m8, m5 + mova m10, [cq+32*4] + mova m11, [cq+32*5] + mova m12, [cq+32*6] + mova m13, [cq+32*7] + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m0, m2 + pmulld m2, m8, m10 + paddd m4, m6 + pmulld m6, m8, m11 + paddd m1, m3 + pmulld m3, m8, m12 + paddd m5, m7 + pmulld m7, m8, m13 + REPX {psrad x, 1 }, m0, m4, m1, m5 + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m2, m10 + paddd m6, m11 + paddd m3, m12 + paddd m7, m13 + REPX {psrad x, 1 }, m2, m6, m3, m7 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 - vpbroadcastd m8, [pd_11586] - vpbroadcastd m9, [pd_2048] + vpbroadcastd m8, [pd_5793] + vpbroadcastd m9, [pd_1024] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 @@ -1795,37 +1897,21 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x4, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + vpbroadcastd m2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 mov [cq], eobd ; 0 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 - movd xm0, r6d - vpbroadcastw m0, xm0 -.end: - vpbroadcastd m4, [pixel_%3bpc_max] - pxor m3, m3 - mova xm1, [dstq+strideq*0] - vinserti128 m1, [dstq+strideq*1], 1 - lea r6, [dstq+strideq*2] - mova xm2, [r6 +strideq*0] - vinserti128 m2, [r6 +strideq*1], 1 - paddw m1, m0 - paddw m2, m0 - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 - mova [dstq+strideq*0], xm1 - vextracti128 [dstq+strideq*1], m1, 1 - mova [r6 +strideq*0], xm2 - vextracti128 [r6 +strideq*1], m2, 1 - RET + or r3d, 4 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +%else + jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly +%endif %endif %endmacro @@ -1960,32 +2046,7 @@ ALIGN function_align REPX {paddd x, m4}, m0, m3, m2, m1 REPX {psrad x, 12}, m0, m3, m2, m1 .main2: - vbroadcasti128 m6, [pd_1321] - vbroadcasti128 m7, [pd_2482] - pmulld m4, m0, m6 ; 1321*in0 - pmulld m5, m3, m7 ; 2482*in3 - paddd m4, m5 ; 1321*in0 + 2482*in3 - pmulld m5, m0, m7 ; 2482*in0 - paddd m0, m3 ; in0 + in3 - paddd m7, m6 ; pd_3803 - pmulld m6, m2 ; 1321*in2 - pmulld m3, m7 ; 3803*in3 - pmulld m7, m2 ; 3803*in2 - psubd m2, m0 ; in2 - in0 - in3 - vpbroadcastd m0, [pd_m3344] - psubd m5, m6 ; 2482*in0 - 1321*in2 - vpbroadcastd m6, [pd_2048] - psubd m5, m3 ; t1 - pmulld m2, m0 ; t2 - pmulld m1, m0 ; -t3 - paddd m4, m7 ; t0 - paddd m5, m6 - paddd m3, m4, m5 - paddd m4, m6 - psubd m4, m1 ; out0 (unshifted) - psubd m5, m1 ; out1 (unshifted) - paddd m2, m6 ; out2 (unshifted) - paddd m3, m1 ; out3 (unshifted) + IADST4_1D ret INV_TXFM_8X4_FN flipadst, dct @@ -2103,10 +2164,13 @@ cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call .pass2_main - psrad m0, m4, 12 - psrad m1, m5, 12 - psrad m2, 12 - psrad m3, 12 + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass2_end: + REPX {psrad x, 12}, m0, m1, m2, m3 .end: vpbroadcastd m4, [pw_16384] REPX {psrad x, 3}, m0, m1, m2, m3 @@ -2162,11 +2226,12 @@ cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).pass2_main - psrad m0, m3, 12 - psrad m3, m4, 12 - psrad m1, m2, 12 - psrad m2, m5, 12 - jmp m(iadst_8x4_internal_12bpc).end + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m3, m5, m4 + paddd m2, m5, m6 + jmp m(iadst_8x4_internal_12bpc).pass2_end INV_TXFM_8X4_FN identity, dct, 12 INV_TXFM_8X4_FN identity, adst, 12 @@ -2197,32 +2262,36 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - mov [cq], eobd ; 0 - mov r3d, 8 + vpbroadcastd m2, [dconly_%3bpc] +%if %3 = 10 .dconly: - add r6d, 6144 - sar r6d, 13 + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm2 vpbroadcastw m0, xm0 - vpbroadcastd m3, [pixel_%3bpc_max] - pxor m2, m2 .dconly_loop: mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 - paddw m1, m0 - pmaxsw m1, m2 - pminsw m1, m3 + paddsw m1, m0 + psubusw m1, m2 mova [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET +%else + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly +%endif %endif %endmacro @@ -2245,7 +2314,7 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a psubd m%10, m%7, m%9 ; t7 paddd m%7, m%9 ; out6 - vpbroadcastd m%9, [pd_2896] + vpbroadcastd m%9, [pd_1448] psubd m%4, m%8, m%6 ; t3 paddd m%8, m%6 ; -out7 psubd m%6, m%1, m%3 ; t2 @@ -2255,10 +2324,10 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 - psubd m%5, m%6, m%4 ; (t2 - t3) * 2896 - paddd m%4, m%6 ; (t2 + t3) * 2896 - psubd m%6, m%3, m%10 ; (t6 - t7) * 2896 - paddd m%3, m%10 ; (t6 + t7) * 2896 + psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 + paddd m%4, m%6 ; (t2 + t3) * 1448 + psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 + paddd m%3, m%10 ; (t6 + t7) * 1448 %endmacro INV_TXFM_8X8_FN dct, dct @@ -2430,8 +2499,8 @@ ALIGN function_align vpbroadcastd m11, [pd_2048] .main2: IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 - psrld m8, 11 ; pd_1 - vpbroadcastd m9, [pd_6144] + psrld m8, 10 ; pd_1 + vpbroadcastd m9, [pd_3072] ret ALIGN function_align .main_end: @@ -2440,14 +2509,14 @@ ALIGN function_align paddd m6, m8 psubd m7, m8, m7 REPX {psrad x, 1 }, m0, m1, m6, m7 - ; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13 - ; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13 - psubd m8, m9, m8 ; pd_6143 + ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 + ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 + psubd m8, m9, m8 ; pd_3071 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 - REPX {psrad x, 13}, m2, m3, m4, m5 + REPX {psrad x, 12}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct @@ -2496,10 +2565,10 @@ ALIGN function_align paddd m5, m9, m2 psubd m2, m8, m3 paddd m3, m9, m4 - psrad m4, m2, 13 - psrad m2, m10, 13 - psrad m3, 13 - psrad m5, 13 + psrad m4, m2, 12 + psrad m2, m10, 12 + psrad m3, 12 + psrad m5, 12 ret INV_TXFM_8X8_FN identity, dct @@ -2681,13 +2750,13 @@ ALIGN function_align paddd m6, m9 psubd m7, m9, m7 REPX {psrad x, 4}, m0, m1, m6, m7 - vpbroadcastd m9, [pd_34816] - psubd m8, m9, m8 ; 34815 + vpbroadcastd m9, [pd_17408] + psubd m8, m9, m8 ; 17407 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 - REPX {psrad x, 16}, m2, m3, m4, m5 + REPX {psrad x, 15}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct, 12 @@ -2729,13 +2798,14 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 %endif %endmacro @@ -2904,7 +2974,7 @@ ALIGN function_align vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 - ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2 psubd m3, m1, m4 ; t10 paddd m1, m4 ; t9 psubd m4, m0, m2 ; t11a @@ -3269,7 +3339,7 @@ cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_end: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 jmp m(idct_8x16_internal_12bpc).end ALIGN function_align .pass2_main: @@ -3302,9 +3372,9 @@ ALIGN function_align pmaxsd m7, m13, [cq+32* 3] ; 3 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_16x8_internal_10bpc).main_part2 - vpbroadcastd m14, [pd_34816] + vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_34815 + psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret @@ -3357,49 +3427,52 @@ ALIGN function_align m8, m9, m10, m11, m12, m13, m14 pminsd m15, [cq] mova [cq], m7 - vpbroadcastd m7, [pd_11586] + vpbroadcastd m7, [pd_5793] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 pmulld m7, [cq] mova [cq], m15 - vpbroadcastd m15, [pd_2048] + vpbroadcastd m15, [pd_1024] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [cq] - REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ret %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x4, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - mov [cq], eobd ; 0 - mov r3d, 4 + vpbroadcastd m3, [dconly_%3bpc] +%if %3 = 10 .dconly: - add r6d, 6144 - sar r6d, 13 + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - vpbroadcastd m4, [pixel_%3bpc_max] - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+strideq*0] - paddw m2, m0, [dstq+strideq*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+strideq*0] + paddsw m2, m0, [dstq+strideq*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET +%else + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly +%endif %endif %endmacro @@ -3480,13 +3553,30 @@ ALIGN function_align .pass1_main2: ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 - psubd m4, m10, m5 ; t9 -t10 + vbroadcasti128 m12, [pd_3784_m3784] + psubd m4, m10, m5 paddd m10, m5 ; t8 t11 - psubd m5, m11, m6 ; t14 -t13 + psignd m4, m12 ; t9 t10 + psubd m5, m11, m6 paddd m11, m6 ; t15 t12 - REPX {pmaxsd x, m8}, m4, m5, m10, m11 - REPX {pminsd x, m9}, m4, m5, m10, m11 - ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2 + psignd m5, m12 ; t14 t13 + vpbroadcastd m6, [pd_1567] + vpbroadcastd m13, [pd_3784] + REPX {pmaxsd x, m8}, m5, m4 + REPX {pminsd x, m9}, m5, m4 + pmulld m12, m5 + pmulld m5, m6 + vbroadcasti128 m6, [pd_1567_m1567] + pmulld m13, m4 + pmulld m4, m6 + REPX {pmaxsd x, m8}, m10, m11, m0, m1 + REPX {pminsd x, m9}, m10, m11, m0, m1 + paddd m12, m7 + paddd m5, m7 + paddd m4, m12 + psubd m5, m13 + psrad m4, 12 ; t14a t10a + psrad m5, 12 ; t9a t13a vpbroadcastd m12, [pd_2896] punpckhqdq m6, m11, m5 punpcklqdq m11, m4 @@ -3500,8 +3590,8 @@ ALIGN function_align REPX {pminsd x, m9}, m5, m6 pmulld m5, m12 pmulld m6, m12 - REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10 - REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10 + REPX {pmaxsd x, m8}, m2, m3, m11, m10 + REPX {pminsd x, m9}, m2, m3, m11, m10 ret ALIGN function_align .pass1_main3: @@ -3565,10 +3655,10 @@ cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(idct_16x4_internal_10bpc).end ALIGN function_align .main: - vbroadcasti128 m6, [pd_1321] + vpbroadcastd m6, [pd_1321] mova m0, [cq+32*0] mova m1, [cq+32*1] - vbroadcasti128 m7, [pd_2482] + vpbroadcastd m7, [pd_2482] mova m2, [cq+32*6] mova m3, [cq+32*7] pmulld m4, m0, m6 @@ -3663,8 +3753,7 @@ INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 -.pass1: - vpbroadcastd m8, [pd_11586] + vpbroadcastd m8, [pd_5793] vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m1, [cq+32*1], q3120 ; 2 3 vpermq m2, [cq+32*2], q3120 ; 4 5 @@ -3673,10 +3762,10 @@ cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpermq m5, [cq+32*5], q3120 ; a b vpermq m6, [cq+32*6], q3120 ; c d vpermq m7, [cq+32*7], q3120 ; e f - vpbroadcastd m9, [pd_6144] + vpbroadcastd m9, [pd_3072] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed @@ -3729,17 +3818,15 @@ cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 pmulld m2, m6, m11 pmulld m4, m6, m12 pmulld m6, m13 - vpbroadcastd m10, [pd_2048] + vpbroadcastd m10, [pd_17408] call m(idct_4x16_internal_10bpc).pass1_main2 - REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 - vpbroadcastd m4, [pw_16384] vpbroadcastd m5, [pixel_12bpc_max] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 - REPX {pmulhrsw x, m4}, m0, m1, m2, m3 jmp m(idct_16x4_internal_10bpc).end2 INV_TXFM_16X4_FN adst, dct, 12 @@ -3824,7 +3911,37 @@ INV_TXFM_16X4_FN identity, flipadst, 12 INV_TXFM_16X4_FN identity, identity, 12 cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iidentity_16x4_internal_10bpc).pass1 + vpbroadcastd m8, [pd_1697] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpbroadcastd m9, [pd_3072] + pmulld m4, m8, m0 + pmulld m5, m8, m1 + pmulld m6, m8, m2 + pmulld m7, m8, m3 + vpermq m10, [cq+32*4], q3120 ; 8 9 + vpermq m11, [cq+32*5], q3120 ; a b + vpermq m12, [cq+32*6], q3120 ; c d + vpermq m13, [cq+32*7], q3120 ; e f + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m0, m4 + pmulld m4, m8, m10 + paddd m1, m5 + pmulld m5, m8, m11 + paddd m2, m6 + pmulld m6, m8, m12 + paddd m3, m7 + pmulld m7, m8, m13 + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m4, m10 + paddd m5, m11 + paddd m6, m12 + paddd m7, m13 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -3844,13 +3961,14 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 8 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_%3bpc).dconly + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 %endif %endmacro @@ -4013,13 +4131,13 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 .pass1: lea r6, [rsp+32*4] call .main - vpbroadcastd m14, [pd_6144] + vpbroadcastd m14, [pd_3072] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_6143 + psubd m13, m14, m15 ; pd_3071 call .pass1_rotations .pass1_end: REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 jmp tx2q .pass2: call m(idct_16x8_internal_10bpc).transpose @@ -4127,8 +4245,6 @@ ALIGN function_align pmaxsd m10, m13 pminsd m9, m14 pminsd m10, m14 - pmulld m9, m15 - pmulld m10, m15 mova [r6-32*4], m1 mova m11, [r6-32*1] ; t7a mova m1, [r6-32*2] ; t6a @@ -4140,7 +4256,6 @@ ALIGN function_align pmaxsd m2, m13 pminsd m8, m14 pminsd m2, m14 - pmulld m8, m15 mova [r6-32*1], m11 mova [r6-32*3], m2 mova m1, [r6+32*3] ; t15 @@ -4153,8 +4268,6 @@ ALIGN function_align pmaxsd m11, m13 pminsd m7, m14 pminsd m11, m14 - pmulld m7, m15 - pmulld m11, m15 mova [r6-32*2], m12 pminsd m1, m14, [r6+32*0] ; t10a pminsd m12, m14, [r6+32*1] ; t11a @@ -4162,13 +4275,13 @@ ALIGN function_align paddd m1, m4 ; -out1 psubd m4, m5, m12 ; t11 paddd m5, m12 ; out14 - pmulld m12, m15, [r6-32*3] ; t6 + vpbroadcastd m12, [pd_1448] pmaxsd m6, m13 pmaxsd m4, m13 pminsd m6, m14 pminsd m4, m14 - pmulld m6, m15 - pmulld m4, m15 + REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 + pmulld m12, [r6-32*3] ; t6 mova [r6-32*3], m5 paddd m5, m11, m7 ; -out5 (unshifted) psubd m11, m7 ; out10 (unshifted) @@ -4233,7 +4346,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 .pass1: lea r6, [rsp+32*4] call m(iadst_16x8_internal_10bpc).main - vpbroadcastd m14, [pd_6144] + vpbroadcastd m14, [pd_3072] psrld m15, 11 psubd m13, m14, m15 call .pass1_rotations @@ -4313,16 +4426,16 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 mova [rsp], m15 - vpbroadcastd m15, [pd_11586] + vpbroadcastd m15, [pd_5793] REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 pmulld m15, [rsp] mova [rsp], m7 - vpbroadcastd m7, [pd_6144] + vpbroadcastd m7, [pd_3072] REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [rsp] - REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: @@ -4340,6 +4453,10 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_20b_max] jmp m(idct_16x8_internal_10bpc).pass1 .pass2: + call .pass2_main + RET +ALIGN function_align +.pass2_main: call m(idct_8x16_internal_12bpc).transpose vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -4383,8 +4500,7 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpermq m1, m5, q3120 vpermq m2, m6, q3120 vpermq m3, m7, q3120 - call m(idct_16x8_internal_10bpc).write_16x4_zero - RET + jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .write_16x4_start: vpbroadcastd m9, [pixel_12bpc_max] @@ -4403,7 +4519,8 @@ cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iadst_16x8_internal_10bpc).pass1 .pass2: call .pass2_main - jmp m(idct_16x8_internal_12bpc).end + call m(idct_16x8_internal_12bpc).end + RET ALIGN function_align .pass2_main: call m(idct_8x16_internal_12bpc).transpose @@ -4483,12 +4600,13 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 %macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 16x16, %4 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 %endif %endmacro @@ -4756,17 +4874,17 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 add cq, 32 call .main sub cq, 32 - vpbroadcastd m8, [pd_10240] + vpbroadcastd m8, [pd_5120] paddd m4, m8 paddd m6, m8 paddd m9, m8 paddd m11, m8 - vpbroadcastd m8, [pd_10239] + vpbroadcastd m8, [pd_5119] psubd m5, m8, m5 psubd m7, m8, m7 psubd m10, m8, m10 psubd m12, m8, m12 - REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12 + REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 @@ -4797,8 +4915,8 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 .fast: add r6, 32*8 call .main - vpbroadcastd m14, [pd_10240] - vpbroadcastd m13, [pd_10239] + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 paddd m0, m15 psubd m1, m15, m1 @@ -4818,7 +4936,7 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 psubd m15, [r6-32*4] .pass1_end: REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 sub r6, 32*8 jmp tx2q .pass2: @@ -4892,17 +5010,17 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx add cq, 32 call m(iadst_16x16_internal_10bpc).main sub cq, 32 - vpbroadcastd m8, [pd_10240] + vpbroadcastd m8, [pd_5120] paddd m11, m8 paddd m9, m8 paddd m6, m8 paddd m4, m8 - vpbroadcastd m8, [pd_10239] + vpbroadcastd m8, [pd_5119] psubd m12, m8, m12 psubd m10, m8, m10 psubd m7, m8, m7 psubd m5, m8, m5 - REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4 + REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 mova [r6+32*0], m12 mova [r6+32*1], m11 mova [r6+32*2], m10 @@ -4933,8 +5051,8 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx .fast: add r6, 32*8 call m(iadst_16x16_internal_10bpc).main - vpbroadcastd m14, [pd_10240] - vpbroadcastd m13, [pd_10239] + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 psubd m8, m13, m7 paddd m7, m14, m9 @@ -4996,9 +5114,8 @@ INV_TXFM_16X16_FN identity, dct, -92 INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 -.pass1: - vpbroadcastd m15, [pd_11586] - vpbroadcastd m7, [pd_10240] + vpbroadcastd m15, [pd_5793] + vpbroadcastd m7, [pd_5120] lea r6, [rsp+32*4] sub eobd, 36 jl .fast @@ -5010,7 +5127,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx pmulld m3, m15, [cq+r3+32*39] add r6, 32*4 REPX {paddd x, m7}, m0, m1, m2, m3 - REPX {psrad x, 14}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 @@ -5038,7 +5155,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [cq] - REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: @@ -5203,7 +5320,7 @@ cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_part3: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 .end: packssdw m15, m14 packssdw m14, m13, m12 @@ -5320,15 +5437,15 @@ ALIGN function_align REPX {pminsd x, m14}, m1, m3, m4, m6 .pass2_fast2: call m(iadst_16x8_internal_10bpc).main_part2 - vpbroadcastd m14, [pd_34816] + vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_34815 + psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret ALIGN function_align .pass2_part2: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 @@ -5375,8 +5492,73 @@ cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx INV_TXFM_16X16_FN identity, dct, -92, 12 INV_TXFM_16X16_FN identity, identity, 0, 12 +%macro IDTX16_12BPC 1 ; src + pmulld m6, m7, m%1 + paddd m6, m15 + psrad m6, 12 + paddd m6, m%1 + psrad m%1, m6, 1 +%endmacro + cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 - jmp m(iidentity_16x16_internal_10bpc).pass1 + vpbroadcastd m7, [pd_1697] + vpbroadcastd m15, [pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + mova m10, [cq+r3+32*33] + mova m11, [cq+r3+32*35] + mova m12, [cq+r3+32*37] + mova m13, [cq+r3+32*39] + add r6, 32*4 + pmulld m0, m7, m10 + pmulld m1, m7, m11 + pmulld m2, m7, m12 + pmulld m3, m7, m13 + REPX {paddd x, m15}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + paddd m0, m10 + paddd m1, m11 + paddd m2, m12 + paddd m3, m13 + REPX {psrad x, 1 }, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m8, [cq+64* 6] + mova m9, [cq+64* 7] + REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 + mova [cq+64*0], m8 + mova [cq+64*1], m9 + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 + mova m6, [cq+64*15] + pmulld m7, m6 + paddd m7, m15 + psrad m7, 12 + paddd m7, m6 + mova m6, [cq+64*0] + psrad m15, m7, 1 + mova m7, [cq+64*1] + jmp tx2q .pass2: call m(iidentity_8x16_internal_12bpc).pass2_main call m(idct_16x16_internal_10bpc).transpose_fast @@ -5429,7 +5611,7 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx call m(idct_16x16_internal_12bpc).write_16x16 RET -%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift +%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack mova m%4, [r6+32*(%1-4)] mova m%2, [r5+32*(3-%1)] mova m%5, [r4+32*(%1-4)] @@ -5446,8 +5628,10 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx paddd m%2, m%3, m%5 ; out15 - n psubd m%3, m%5 ; out16 + n REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%if %7 & 1 packssdw m%1, m%3 ; out0 + n, out16 + n packssdw m%2, m%4 ; out15 - n, out31 - n +%endif %endmacro cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob @@ -5574,14 +5758,15 @@ cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 ALIGN function_align -.pass1_main: +.pass1_main_part1: mova m0, [cq+128*0] mova m1, [cq+128*1] mova m2, [cq+128*2] @@ -5590,7 +5775,6 @@ ALIGN function_align mova m5, [cq+128*5] mova m6, [cq+128*6] mova m7, [cq+128*7] - add cq, 32 call m(idct_8x8_internal_10bpc).main psrld m1, m11, 10 ; pd_2 REPX {paddd x, m1}, m0, m6, m5, m3 @@ -5603,6 +5787,11 @@ ALIGN function_align psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.pass1_main: + call .pass1_main_part1 + add cq, 32 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 @@ -5665,7 +5854,7 @@ ALIGN function_align vpbroadcastd m15, [pd_4017] vpbroadcastd m10, [pd_799] ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a - ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m1 ; t28a @@ -5734,7 +5923,7 @@ ALIGN function_align vpbroadcastd m15, [pd_2276] vpbroadcastd m10, [pd_3406] ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a - ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a psubd m3, m0, m6 ; t27a paddd m0, m6 ; t24a psubd m6, m7, m1 ; t20a @@ -5747,8 +5936,8 @@ ALIGN function_align REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] - ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a - ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 mova m9, [r6-32*4] ; t16a mova m10, [r6-32*3] ; t17 psubd m2, m9, m7 ; t23 @@ -5881,8 +6070,9 @@ ALIGN function_align ret cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob - vpbroadcastd m5, [pw_5] vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_5] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -5947,30 +6137,262 @@ ALIGN function_align vextracti128 [dstq+r4 ], m3, 1 ret +cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + mov r4, cq + lea r6, [rsp+32*4] + call .pass1_main + cmp eobd, 43 + jge .eob43 + jmp .pass2_fast +.eob43: + call .pass1_main + cmp eobd, 107 + jge .eob107 +.pass2_fast: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(idct_8x16_internal_10bpc).main_oddhalf_fast + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + jmp .pass2_end +.eob107: + call .pass1_main + cmp eobd, 171 + jge .eob171 + jmp .pass2 +.eob171: + call .pass1_main +.pass2: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + pmaxsd m4, m12, [cq+128*1+64] + pmaxsd m5, m12, [cq+128*7+64] + pmaxsd m6, m12, [cq+128*1+96] + pmaxsd m7, m12, [cq+128*7+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + pmaxsd m4, m12, [cq+128*3+64] + pmaxsd m5, m12, [cq+128*5+64] + pmaxsd m6, m12, [cq+128*3+96] + pmaxsd m7, m12, [cq+128*5+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + pmaxsd m4, m12, [cq+128*2+64] + pmaxsd m5, m12, [cq+128*6+64] + pmaxsd m6, m12, [cq+128*2+96] + pmaxsd m7, m12, [cq+128*6+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + pmaxsd m4, m12, [cq+128*0+64] + pmaxsd m5, m12, [cq+128*4+64] + pmaxsd m6, m12, [cq+128*0+96] + pmaxsd m7, m12, [cq+128*4+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf +.pass2_end: + psrld m11, 8 ; pd_8 + IDCT32_END 0, 15, 8, 9, 10, 4 + IDCT32_END 1, 14, 8, 9, 10, 4 + punpckhqdq m8, m0, m1 ; 16 17 (interleaved) + punpcklqdq m0, m1 ; 0 1 (interleaved) + punpcklqdq m1, m14, m15 ; 14 15 (interleaved) + punpckhqdq m14, m15 ; 30 31 (interleaved) + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 4 + IDCT32_END 3, 14, 8, 9, 10, 4 + punpckhqdq m8, m2, m3 ; 18 19 (interleaved) + punpcklqdq m2, m3 ; 2 3 (interleaved) + punpcklqdq m3, m14, m15 ; 12 13 (interleaved) + punpckhqdq m14, m15 ; 28 29 (interleaved) + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 4 + IDCT32_END 5, 14, 8, 9, 10, 4 + punpckhqdq m8, m4, m5 ; 20 21 (interleaved) + punpcklqdq m4, m5 ; 4 5 (interleaved) + punpcklqdq m5, m14, m15 ; 10 11 (interleaved) + punpckhqdq m14, m15 ; 26 27 (interleaved) + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 4 + IDCT32_END 7, 14, 8, 9, 10, 4 + punpckhqdq m8, m6, m7 ; 22 23 (interleaved) + punpcklqdq m6, m7 ; 6 7 (interleaved) + punpcklqdq m7, m14, m15 ; 8 9 (interleaved) + punpckhqdq m14, m15 ; 24 25 (interleaved) + mova [r5-32*3], m8 + mova [r5-32*4], m14 + mova m15, m1 +.end: + vpermq m0, m0, q3120 + vpermq m1, m2, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m6, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m7, q3120 + vpermq m1, m5, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m3, q3120 + vpermq m1, m15, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*3], q3120 + vpermq m1, [r5+32*1], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*1], q3120 + vpermq m1, [r5-32*3], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*4], q3120 + vpermq m1, [r5-32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*0], q3120 + vpermq m1, [r5+32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +ALIGN function_align +.pass1_main: + call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 32 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2, 0 + mova [cq+32*16], m8 + mova [cq+32*31], m9 + IDCT32_END 1, 14, 8, 9, 10, 2, 0 + mova [cq+32*17], m8 + mova [cq+32*30], m9 + mova [cq+32*14], m14 + IDCT32_END 2, 14, 8, 9, 10, 2, 0 + mova [cq+32*18], m8 + mova [cq+32*29], m9 + mova [cq+32*13], m14 + IDCT32_END 3, 14, 8, 9, 10, 2, 0 + mova [cq+32*19], m8 + mova [cq+32*28], m9 + mova [cq+32*12], m14 + IDCT32_END 4, 14, 8, 9, 10, 2, 0 + mova [cq+32*20], m8 + mova [cq+32*27], m9 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + IDCT32_END 5, 10, 0, 1, 2, 2, 0 + mova [cq+32*21], m0 + mova [cq+32*26], m1 + IDCT32_END 6, 9, 0, 1, 2, 2, 0 + mova [cq+32*22], m0 + mova [cq+32*25], m1 + IDCT32_END 7, 8, 0, 1, 2, 2, 0 + mova [cq+32*23], m0 + mova [cq+32*24], m1 + mova m0, [cq+32* 0] + mova m1, [cq+32* 1] + mova m2, [cq+32* 2] + mova m11, m14 + mova m12, [cq+32*12] + mova m13, [cq+32*13] + mova m14, [cq+32*14] + ret + +cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 + cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 .dconly: - add r6d, 10240 - sar r6d, 14 + add r6d, 640 + sar r6d, 10 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - vpbroadcastd m4, [pixel_10bpc_max] - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+32*0], m1 mova [dstq+32*1], m2 add dstq, strideq @@ -5979,6 +6401,39 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob RET .full: PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + call .pass1 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end + lea r6, [deint_shuf+128] + vpbroadcastd m11, [pw_2048] + mov r4, dstq + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + lea dstq, [r4+32] + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct_16x8_internal_8bpc).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.pass1: mova m0, [cq+32* 1] mova m1, [cq+32* 7] mova m2, [cq+32* 9] @@ -5988,10 +6443,7 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob mova m6, [cq+32*25] mova m7, [cq+32*31] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_18b_min] - vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] - lea r6, [rsp+32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+32* 3] mova m1, [cq+32* 5] @@ -6021,37 +6473,12 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob mova m7, [cq+32*28] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf - call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end - lea r6, [deint_shuf+128] - vpbroadcastd m11, [pw_2048] - mov r4, dstq - call .pass2 - mova m0, [r5+32*3] ; 16 17 - mova m1, [r5+32*2] ; 30 31 - mova m2, [r5+32*1] ; 18 19 - mova m3, [r5+32*0] ; 28 29 - mova m4, [r5-32*1] ; 20 21 - mova m5, [r5-32*2] ; 26 27 - mova m6, [r5-32*3] ; 22 23 - mova m7, [r5-32*4] ; 24 25 - call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose - lea dstq, [r4+32] - call .pass2 - RET -ALIGN function_align -.pass2: - call m(idct_16x8_internal_8bpc).main - REPX {pmulhrsw x, m11}, m0, m1, m2, m3 - call m(idct_16x8_internal_10bpc).write_16x4_start - pmulhrsw m0, m11, m4 - pmulhrsw m1, m11, m5 - pmulhrsw m2, m11, m6 - pmulhrsw m3, m11, m7 - jmp m(idct_16x8_internal_10bpc).write_16x4_zero + ret cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob - vpbroadcastd m5, [pw_4096] vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_4096] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -6078,6 +6505,47 @@ cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob jge .loop RET +cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 8 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 + call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end + mov r4, dstq + call m(idct_16x8_internal_12bpc).pass2_main + mova m0, [cq+32* 0] ; 16 + mova m1, [cq+32* 1] ; 17 + mova m2, [cq+32* 2] ; 18 + mova m3, [cq+32* 3] ; 19 + mova m4, [cq+32* 4] ; 20 + mova m5, [cq+32* 5] ; 21 + mova m6, [cq+32* 6] ; 22 + mova m7, [cq+32* 7] ; 23 + mova m8, [cq+32* 8] ; 24 + mova m9, [cq+32* 9] ; 25 + mova m10, [cq+32*10] ; 26 + mova m11, [cq+32*11] ; 27 + mova m12, [cq+32*12] ; 28 + mova m13, [cq+32*13] ; 29 + mova m14, [cq+32*14] ; 30 + mova m15, [cq+32*15] ; 31 + lea dstq, [r4+32] + call m(idct_16x8_internal_12bpc).pass2_main + RET + +cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 + %macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] mova m%4, [%2] paddsw m%3, m%1, m%4 @@ -6121,13 +6589,14 @@ cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 jmp .fast .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 .eob44: mova [r4+16*0], xm0 mova [r4+16*1], xm3 @@ -6472,14 +6941,15 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] jmp .end .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .full: add cq, 32 @@ -6742,9 +7212,10 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .fast: lea r4, [rsp+32*71] @@ -7019,12 +7490,13 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 .fast: lea r4, [rsp+32*38] pxor m0, m0 @@ -7246,7 +7718,7 @@ ALIGN function_align REPX {pmaxsd x, m12}, m8, m1, m6, m2 REPX {pminsd x, m13}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a - ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a REPX {pmaxsd x, m12}, m0, m3, m7, m4 REPX {pminsd x, m13}, m0, m3, m7, m4 vpbroadcastd m10, [r5+4*10] @@ -7301,7 +7773,7 @@ ALIGN function_align REPX {pmaxsd x, m12}, m8, m1, m3, m4 REPX {pminsd x, m13}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a - ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a REPX {pmaxsd x, m12}, m0, m2, m5, m7 REPX {pminsd x, m13}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a @@ -7358,14 +7830,15 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .fast: lea r4, [rsp+32*70] @@ -7540,30 +8013,26 @@ ALIGN function_align cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .normal - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 .dconly: - add r6d, 10240 - sar r6d, 14 + add r6d, 640 + sar r6d, 10 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + vpbroadcastd m5, [dconly_10bpc] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d -%if WIN64 - movaps [rsp+8], xmm6 -%endif + paddsw xm0, xm5 vpbroadcastw m0, xm0 - vpbroadcastd m6, [pixel_10bpc_max] - pxor m5, m5 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - paddw m3, m0, [dstq+32*2] - paddw m4, m0, [dstq+32*3] - REPX {pmaxsw x, m5}, m1, m2, m3, m4 - REPX {pminsw x, m6}, m1, m2, m3, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + paddsw m3, m0, [dstq+32*2] + paddsw m4, m0, [dstq+32*3] + REPX {psubusw x, m5}, m1, m2, m3, m4 mova [dstq+32*0], m1 mova [dstq+32*1], m2 mova [dstq+32*2], m3 @@ -7571,9 +8040,6 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob add dstq, strideq dec r3d jg .dconly_loop -%if WIN64 - movaps xmm6, [rsp+8] -%endif RET .normal: PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob @@ -7814,14 +8280,14 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 .fast: pxor m0, m0 @@ -7963,9 +8429,9 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly .fast: pxor m0, m0 |