summaryrefslogtreecommitdiff
path: root/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
diff options
context:
space:
mode:
authorDale Curtis <dalecurtis@chromium.org>2022-12-16 22:37:46 +0000
committerMichael Brüning <michael.bruning@qt.io>2023-03-27 08:12:03 +0000
commitc885ec409f9b6ffa25e03851729b1bc2ad2005b3 (patch)
tree0c9f205efc231ede87d2704b2780d1569caf5111 /chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
parent0d63fc949d16f3e37ed7ab43d335b9d81cc6fdf7 (diff)
downloadqtwebengine-chromium-c885ec409f9b6ffa25e03851729b1bc2ad2005b3.tar.gz
[Backport] Security bug 1401571 (102-based)
Manual update of libdav1d to match the version introduced by patch https://chromium-review.googlesource.com/c/chromium/src/+/4114163: Roll src/third_party/dav1d/libdav1d/ 87f9a81cd..ed63a7459 (104 commits) This roll required a few changes to get working: - "properties" => "built in options" crossfile configuration change due to Meson deprecation. - generic config creation never worked, so fixed. - PPC64 configs were never checked in, so switched to generic. - copyright header changes for generate_sources. - Updated readme.chromium with potential issues that can arise. https://chromium.googlesource.com/external/github.com/videolan/dav1d.git/+log/87f9a81cd770..ed63a7459376 $ git log 87f9a81cd..ed63a7459 --date=short --no-merges --format='%ad %ae %s' 2022-12-09 jamrial dav1d: add an option to skip decoding some frame types 2022-12-08 jamrial picture: support creating and freeing refs without tile data 2022-12-07 gramner x86: Add 10bpc 8x32/32x8 itx AVX-512 (Ice Lake) asm 2022-12-07 gramner x86: Add minor DC-only IDCT optimizations 2022-12-13 gramner getbits: Fix assertion failure 2022-12-13 gramner checkasm: Fix integer overflow in refmvs test 2022-01-26 gramner dav1dplay: Update to new libplacebo API 2022-12-09 gramner Add minor getbits improvements 2022-12-09 gramner Add a separate getbits function for getting a single bit 2022-12-09 gramner Remove redundant zeroing in sequence header parsing 2022-12-09 gramner Set the correct default value of initial_display_delay 2022-12-09 jamrial tools: remove the null last entry in inloop_filters_tbl 2022-12-04 lu_zero Do not assume the picture allocation starts as the left edge 2022-11-21 lu_zero ppc: Allocate the correct temp buffer size 2022-11-21 lu_zero ppc: Do not use static const with vec_splats 2022-11-02 charlie.c.hayden Add info to dav1d_send_data docs 2022-10-30 jbeich build: drop -D_DARWIN_C_SOURCE on macOS/iOS after 6b611d36acab 2022-10-30 jbeich build: drop -D_POSIX_C_SOURCE on non-Linux after 6b611d36acab 
2022-06-28 victorien threading: Add a pending list for async task insertion 2022-10-26 martin Implement atomic_compare_exchange_strong in the atomic compat headers 2022-10-06 victorien threading: Fix a race around frame completion (frame-mt) 2022-10-07 sebastian Handle host_machine.system() 'ios' and 'tvos' the same way as 'darwin' 2022-09-23 gramner x86: Add 10-bit 8x8/8x16/16x8/16x16 itx AVX-512 (Ice Lake) asm 2022-09-30 gramner Specify hidden visibility for global data symbol declarations 2022-09-28 gramner build: strip() the result of cc.get_define() 2022-09-26 gramner checkasm: Move printf format string to .rodata on x86 2022-09-26 gramner checkasm: Improve 32-bit parameter clobbering on x86-64 2022-09-26 gramner x86: Fix incorrect 32-bit parameter usage in high bit-depth AVX-512 mc 2022-09-09 martin arm: itx: Add clipping to row_clip_min/max in the 10 bpc codepaths 2022-09-15 gramner x86: Fix overflows in 12bpc AVX2 IDCT/IADST 2022-09-15 gramner x86: Fix overflows in 12bpc AVX2 DC-only IDCT 2022-09-15 gramner x86: Fix clipping in high bit-depth AVX2 4x16 IDCT 2022-03-21 martin Don't use gas-preprocessor with clang-cl for arm targets 2022-06-07 david_conrad Fix checking the reference dimesions for the projection process 2022-06-07 david_conrad Fix calculation of OBMC lap dimensions 2022-06-07 david_conrad Support film grain application whose only effect is clipping to video range 2022-06-07 david_conrad Ignore T.35 metadata if the OBU contains no payload 2022-06-07 david_conrad Fix chroma deblock filter size calculation for lossless 2022-06-07 david_conrad Fix rounding in the calculation of initialSubpelX 2022-06-07 david_conrad Fix overflow when saturating dequantized coefficients clipped to 0 2022-06-08 david_conrad Fix overflow in 8-bit NEON ADST 2022-09-14 martin tools: Allocate the priv structs with proper alignment 2022-09-08 gramner x86: Fix clipping in 10bpc SSE4.1 IDCT asm 2022-09-08 gramner build: Improve Windows linking options 2022-09-08 gramner 
tools: Improve demuxer probing 2022-08-30 code CI: Disable trimming on some tests 2022-08-30 code CI: Remove git 'safe.directory' config 2022-08-30 code gcovr: Ignore parsing errors 2022-08-30 code crossfiles: Update Android toolchains 2022-08-30 code CI: Update images (...) 2022-09-01 victorien checkasm: Add short options 2022-09-01 victorien checkasm: Add pattern matching to --test 2022-09-01 victorien checkasm: Remove pattern matching from --bench 2022-08-29 victorien checkasm: Add a --function option 2022-08-30 victorien threading: Fix copy_lpf_progress initialization 2022-08-19 jamrial data: don't overwrite the Dav1dDataProps size value 2022-07-18 gramner Adjust inlining attributes on some functions 2022-07-19 gramner x86: Remove leftover instruction in loopfilter AVX2 asm 2022-06-07 david_conrad Enable pointer authentication in assembly when building arm64e 2022-06-07 david_conrad Don't trash the return stack buffer in the NEON loop filter 2022-07-03 thresh CI: Removed snap package generation 2022-07-06 gramner Eliminate unused C DSP functions at compile time 2022-07-06 gramner cpu: Inline dav1d_get_cpu_flags() 2022-06-22 gramner x86: Add minor loopfilter asm improvements 2022-06-15 gramner checkasm: Speed up signal handling 2022-06-15 gramner checkasm: Improve seed generation on Windows 2022-06-20 gramner ci: Don't specify a specific MacOS version 2022-06-14 gramner x86: Add high bit-depth loopfilter AVX-512 (Ice Lake) asm 2022-06-13 victorien checkasm/lpf: Use operating dimensions 2022-06-03 gramner checkasm: Print the cpu model and cpuid signature on x86 2022-06-03 gramner checkasm: Add a vzeroupper check on x86 2022-06-02 gramner x86: Add a workaround for quirky AVX-512 hardware behavior 2022-05-31 victorien checkasm: Fix uninitialized variable 2022-05-14 code CI: Update coverage collecting 2022-05-05 code CI: Add a build with the minimum requirements 2022-05-05 code CI: Deactivate git 'safe.directory' 2022-03-24 code CI: Update images 2022-05-25 
victorien Fix typo 2022-05-19 gramner x86: Add high bit-depth cdef_filter AVX-512 (Ice Lake) asm 2022-05-20 gramner checkasm: Print --help message to stderr instead of stdout 2022-05-20 gramner checkasm: Split cdef test into separate pri/sec/pri+sec parts 2022-05-20 gramner checkasm: Improve benchmarking of functions that modify their input 2022-05-18 b x86/itx_avx2: fix typo 2022-04-22 code CI: Add gcc12 and clang14 builds with mold linker 2022-04-26 code CI: Trigger documentation rebuild if configuration changes 2022-04-24 code meson/doc: Fix doxygen config 2022-04-28 gramner Use a relaxed memory ordering in dav1d_ref_inc() 2022-04-28 gramner Remove redundant code in dav1d_cdf_thread_unref() 2022-04-28 gramner Inline dav1d_ref_inc() 2022-04-24 code x86/itx: Add 32x8 12bpc AVX2 transforms 2022-04-24 code x86/itx: Add 8x32 12bpc AVX2 transforms 2022-04-24 code x86/itx: Deduplicate dconly code 2022-04-23 code lib: Fix typo in documentation 2022-04-07 jamrial obu: don't output invisible but showable key frames more than once 2022-04-07 jamrial obu: check that the frame referenced by existing_frame_idx is showable 2022-04-07 jamrial obu: check refresh_frame_flags is not equal to allFrames on Intra Only frames 2022-03-29 robux4 remove multipass wait from dav1d_decode_frame 2022-04-07 jamrial picture: ensure the new seq header and op param info flags are attached to the next visible picture in display order 2022-03-31 jamrial lib: add a function to query the decoder frame delay 2022-03-31 jamrial lib: split calculating thread count to its own function Created with: roll-dep src/third_party/dav1d/libdav1d Fixed: 1401571 Change-Id: Ic3cef540a87a2cf411abe6071fd4c9963ea61f75 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4114163 Reviewed-by: Wan-Teh Chang <wtc@google.com> Commit-Queue: Dale Curtis <dalecurtis@chromium.org> Cr-Commit-Position: refs/heads/main@{#1084574} Reviewed-on: https://codereview.qt-project.org/c/qt/qtwebengine-chromium/+/468619 
Reviewed-by: Michal Klocek <michal.klocek@qt.io>
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm')
-rw-r--r--chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm231
1 files changed, 123 insertions, 108 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
index 4fb30ef4e7a..3833e17c99f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
@@ -361,18 +361,32 @@ ALIGN function_align
%macro INV_TXFM_4X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
- movd m1, [o(pw_2896x8)]
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
+ mov r3d, 4
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
movd m0, r5d
- packssdw m0, m0
- pmulhrsw m0, m1
- pshuflw m0, m0, q0000
+ pshuflw m0, m0, q1111
+ pxor m3, m3
punpcklqdq m0, m0
- mova m1, m0
- TAIL_CALL m(iadst_4x4_internal_16bpc).end
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
%endif
%endmacro
@@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 2
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
-.end:
- imul r5d, 2896
- add r5d, 34816
- movd m0, r5d
- pshuflw m0, m0, q1111
- punpcklqdq m0, m0
- pxor m4, m4
- mova m3, [o(pixel_10bpc_max)]
- lea r2, [strideq*3]
-.loop:
- movq m1, [dstq+strideq*0]
- movq m2, [dstq+strideq*2]
- movhps m1, [dstq+strideq*1]
- movhps m2, [dstq+r2]
- paddw m1, m0
- paddw m2, m0
- REPX {pminsw x, m3}, m1, m2
- REPX {pmaxsw x, m4}, m1, m2
- movq [dstq+strideq*0], m1
- movhps [dstq+strideq*1], m1
- movq [dstq+strideq*2], m2
- movhps [dstq+r2 ], m2
- lea dstq, [dstq+strideq*4]
- dec r3d
- jg .loop
- RET
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro
@@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 4
- add r5d, 6144
- sar r5d, 13
- jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
%endif
%endmacro
@@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
imul r5d, 2896
add r5d, 34816
movd m0, r5d
@@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 2
.end:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.end2:
imul r5d, 2896
add r5d, 34816
@@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
mov r3d, 4
%if stack_size_padded > 0
; adjust to caller's stack allocation
@@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
pcmpeqd m8, m8
REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@@ -2785,6 +2774,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
pcmpeqd m0, m0
REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
mova [r3+ 1*16], m1
@@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
%if ARCH_X86_32
add rsp, 1*16
%endif
@@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (5+ARCH_X86_64*3+WIN64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
@@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
psrld m8, m11, 10 ; 2
REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@@ -4087,6 +4086,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
mova m0, [o(pd_2)]
REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
paddd m0, [r3+ 0*16]
@@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (31+2*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
@@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
%endif
RET
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
add rsp, (65+4*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
@@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
; final sumsub for idct16 as well as idct32, plus final downshift
%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
mova m%4, [r3+16*(23-%1)]
+ pmaxsd m%1, m12
+ pminsd m%1, m13
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12
@@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
.loop_dct32_end:
mova m0, [r3+16*16]
mova m6, [r3+16*24]
+ pmaxsd m0, m2
+ pminsd m0, m3
psubd m5, m0, m6 ; idct16 out15 - n
paddd m0, m6 ; idct16 out0 + n
pmaxsd m0, m2
@@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
@@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
@@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
@@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
@@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova m5, [r3-16* 4] ; idct64 48 + n
mova m6, [r4-16*20] ; idct64 47 - n
mova m7, [r3-16*20] ; idct64 32 + n
+ pmaxsd m0, m12
+ pminsd m0, m13
paddd m8, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m12}, m8, m0
@@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova [r4-16* 4], m6
mova [r3+16*12], m8
%else
+ mova m5, [o(clip_18b_min)]
+ mova m6, [o(clip_18b_max)]
mova m1, [r3+16*44] ; idct16 15 - n
+ pmaxsd m0, m5
+ pminsd m0, m6
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
- mova m5, [o(clip_18b_min)]
- mova m6, [o(clip_18b_max)]
REPX {pmaxsd x, m5}, m4, m0
REPX {pminsd x, m6}, m4, m0
paddd m1, m4, m3 ; idct32 out0 + n
@@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (1+8*32+1*WIN64)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
@@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \