diff options
author | Dale Curtis <dalecurtis@chromium.org> | 2022-12-16 22:37:46 +0000 |
---|---|---|
committer | Michael Brüning <michael.bruning@qt.io> | 2023-03-27 08:12:03 +0000 |
commit | c885ec409f9b6ffa25e03851729b1bc2ad2005b3 (patch) | |
tree | 0c9f205efc231ede87d2704b2780d1569caf5111 /chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm | |
parent | 0d63fc949d16f3e37ed7ab43d335b9d81cc6fdf7 (diff) | |
download | qtwebengine-chromium-102-based.tar.gz |
[Backport] Security bug 1401571 (branch: 102-based)
Manual update of libdav1d to match the version introduced by patch
https://chromium-review.googlesource.com/c/chromium/src/+/4114163:
Roll src/third_party/dav1d/libdav1d/ 87f9a81cd..ed63a7459 (104 commits)
This roll required a few changes to get working:
- "properties" => "built in options" crossfile configuration change due to Meson deprecation.
- generic config creation never worked, so fixed.
- PPC64 configs were never checked in, so switched to generic.
- copyright header changes for generate_sources.
- Updated readme.chromium with potential issues that can arise.
https://chromium.googlesource.com/external/github.com/videolan/dav1d.git/+log/87f9a81cd770..ed63a7459376
$ git log 87f9a81cd..ed63a7459 --date=short --no-merges --format='%ad %ae %s'
2022-12-09 jamrial dav1d: add an option to skip decoding some frame types
2022-12-08 jamrial picture: support creating and freeing refs without tile data
2022-12-07 gramner x86: Add 10bpc 8x32/32x8 itx AVX-512 (Ice Lake) asm
2022-12-07 gramner x86: Add minor DC-only IDCT optimizations
2022-12-13 gramner getbits: Fix assertion failure
2022-12-13 gramner checkasm: Fix integer overflow in refmvs test
2022-01-26 gramner dav1dplay: Update to new libplacebo API
2022-12-09 gramner Add minor getbits improvements
2022-12-09 gramner Add a separate getbits function for getting a single bit
2022-12-09 gramner Remove redundant zeroing in sequence header parsing
2022-12-09 gramner Set the correct default value of initial_display_delay
2022-12-09 jamrial tools: remove the null last entry in inloop_filters_tbl
2022-12-04 lu_zero Do not assume the picture allocation starts as the left edge
2022-11-21 lu_zero ppc: Allocate the correct temp buffer size
2022-11-21 lu_zero ppc: Do not use static const with vec_splats
2022-11-02 charlie.c.hayden Add info to dav1d_send_data docs
2022-10-30 jbeich build: drop -D_DARWIN_C_SOURCE on macOS/iOS after 6b611d36acab
2022-10-30 jbeich build: drop -D_POSIX_C_SOURCE on non-Linux after 6b611d36acab
2022-06-28 victorien threading: Add a pending list for async task insertion
2022-10-26 martin Implement atomic_compare_exchange_strong in the atomic compat headers
2022-10-06 victorien threading: Fix a race around frame completion (frame-mt)
2022-10-07 sebastian Handle host_machine.system() 'ios' and 'tvos' the same way as 'darwin'
2022-09-23 gramner x86: Add 10-bit 8x8/8x16/16x8/16x16 itx AVX-512 (Ice Lake) asm
2022-09-30 gramner Specify hidden visibility for global data symbol declarations
2022-09-28 gramner build: strip() the result of cc.get_define()
2022-09-26 gramner checkasm: Move printf format string to .rodata on x86
2022-09-26 gramner checkasm: Improve 32-bit parameter clobbering on x86-64
2022-09-26 gramner x86: Fix incorrect 32-bit parameter usage in high bit-depth AVX-512 mc
2022-09-09 martin arm: itx: Add clipping to row_clip_min/max in the 10 bpc codepaths
2022-09-15 gramner x86: Fix overflows in 12bpc AVX2 IDCT/IADST
2022-09-15 gramner x86: Fix overflows in 12bpc AVX2 DC-only IDCT
2022-09-15 gramner x86: Fix clipping in high bit-depth AVX2 4x16 IDCT
2022-03-21 martin Don't use gas-preprocessor with clang-cl for arm targets
2022-06-07 david_conrad Fix checking the reference dimesions for the projection process
2022-06-07 david_conrad Fix calculation of OBMC lap dimensions
2022-06-07 david_conrad Support film grain application whose only effect is clipping to video range
2022-06-07 david_conrad Ignore T.35 metadata if the OBU contains no payload
2022-06-07 david_conrad Fix chroma deblock filter size calculation for lossless
2022-06-07 david_conrad Fix rounding in the calculation of initialSubpelX
2022-06-07 david_conrad Fix overflow when saturating dequantized coefficients clipped to 0
2022-06-08 david_conrad Fix overflow in 8-bit NEON ADST
2022-09-14 martin tools: Allocate the priv structs with proper alignment
2022-09-08 gramner x86: Fix clipping in 10bpc SSE4.1 IDCT asm
2022-09-08 gramner build: Improve Windows linking options
2022-09-08 gramner tools: Improve demuxer probing
2022-08-30 code CI: Disable trimming on some tests
2022-08-30 code CI: Remove git 'safe.directory' config
2022-08-30 code gcovr: Ignore parsing errors
2022-08-30 code crossfiles: Update Android toolchains
2022-08-30 code CI: Update images
(...)
2022-09-01 victorien checkasm: Add short options
2022-09-01 victorien checkasm: Add pattern matching to --test
2022-09-01 victorien checkasm: Remove pattern matching from --bench
2022-08-29 victorien checkasm: Add a --function option
2022-08-30 victorien threading: Fix copy_lpf_progress initialization
2022-08-19 jamrial data: don't overwrite the Dav1dDataProps size value
2022-07-18 gramner Adjust inlining attributes on some functions
2022-07-19 gramner x86: Remove leftover instruction in loopfilter AVX2 asm
2022-06-07 david_conrad Enable pointer authentication in assembly when building arm64e
2022-06-07 david_conrad Don't trash the return stack buffer in the NEON loop filter
2022-07-03 thresh CI: Removed snap package generation
2022-07-06 gramner Eliminate unused C DSP functions at compile time
2022-07-06 gramner cpu: Inline dav1d_get_cpu_flags()
2022-06-22 gramner x86: Add minor loopfilter asm improvements
2022-06-15 gramner checkasm: Speed up signal handling
2022-06-15 gramner checkasm: Improve seed generation on Windows
2022-06-20 gramner ci: Don't specify a specific MacOS version
2022-06-14 gramner x86: Add high bit-depth loopfilter AVX-512 (Ice Lake) asm
2022-06-13 victorien checkasm/lpf: Use operating dimensions
2022-06-03 gramner checkasm: Print the cpu model and cpuid signature on x86
2022-06-03 gramner checkasm: Add a vzeroupper check on x86
2022-06-02 gramner x86: Add a workaround for quirky AVX-512 hardware behavior
2022-05-31 victorien checkasm: Fix uninitialized variable
2022-05-14 code CI: Update coverage collecting
2022-05-05 code CI: Add a build with the minimum requirements
2022-05-05 code CI: Deactivate git 'safe.directory'
2022-03-24 code CI: Update images
2022-05-25 victorien Fix typo
2022-05-19 gramner x86: Add high bit-depth cdef_filter AVX-512 (Ice Lake) asm
2022-05-20 gramner checkasm: Print --help message to stderr instead of stdout
2022-05-20 gramner checkasm: Split cdef test into separate pri/sec/pri+sec parts
2022-05-20 gramner checkasm: Improve benchmarking of functions that modify their input
2022-05-18 b x86/itx_avx2: fix typo
2022-04-22 code CI: Add gcc12 and clang14 builds with mold linker
2022-04-26 code CI: Trigger documentation rebuild if configuration changes
2022-04-24 code meson/doc: Fix doxygen config
2022-04-28 gramner Use a relaxed memory ordering in dav1d_ref_inc()
2022-04-28 gramner Remove redundant code in dav1d_cdf_thread_unref()
2022-04-28 gramner Inline dav1d_ref_inc()
2022-04-24 code x86/itx: Add 32x8 12bpc AVX2 transforms
2022-04-24 code x86/itx: Add 8x32 12bpc AVX2 transforms
2022-04-24 code x86/itx: Deduplicate dconly code
2022-04-23 code lib: Fix typo in documentation
2022-04-07 jamrial obu: don't output invisible but showable key frames more than once
2022-04-07 jamrial obu: check that the frame referenced by existing_frame_idx is showable
2022-04-07 jamrial obu: check refresh_frame_flags is not equal to allFrames on Intra Only frames
2022-03-29 robux4 remove multipass wait from dav1d_decode_frame
2022-04-07 jamrial picture: ensure the new seq header and op param info flags are attached to the next visible picture in display order
2022-03-31 jamrial lib: add a function to query the decoder frame delay
2022-03-31 jamrial lib: split calculating thread count to its own function
Created with:
roll-dep src/third_party/dav1d/libdav1d
Fixed: 1401571
Change-Id: Ic3cef540a87a2cf411abe6071fd4c9963ea61f75
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4114163
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Dale Curtis <dalecurtis@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1084574}
Reviewed-on: https://codereview.qt-project.org/c/qt/qtwebengine-chromium/+/468619
Reviewed-by: Michal Klocek <michal.klocek@qt.io>
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm')
-rw-r--r-- | chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm | 172 |
1 files changed, 89 insertions, 83 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm index 361ccc3b883..ed83000ac24 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm @@ -30,22 +30,24 @@ SECTION_RODATA 32 +pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 times 4 db 8, 9 times 4 db 0, 1 times 4 db 8, 9 -pw_1: times 16 dw 1 -pw_2: times 16 dw 2 -pw_3: times 16 dw 3 -; 4 and 16 need to be next to each other since they are used as alternates -; depending on whether bitdepth is 10 or 12 -pw_4: times 16 dw 4 -pw_16: times 16 dw 16 -pw_8: times 16 dw 8 -pw_4096: times 16 dw 4096 +pw_1: times 16 dw 1 +pw_2: times 16 dw 2 +pw_3: times 16 dw 3 +pw_4096: times 2 dw 4096 -pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 +; 10bpc/12bpc: +pw_4: times 2 dw 4 + times 2 dw 16 +clip_max: times 2 dw 511 + times 2 dw 2047 +clip_min: times 2 dw -512 + times 2 dw -2048 SECTION .text @@ -398,9 +400,10 @@ SECTION .text pmaxuw m2, [pw_1] ; I psrlw m1, m0, 4 ; H paddw m0, [pw_2] + vpbroadcastd m8, [r11] paddw m0, m0 paddw m0, m2 ; E - REPX {pmullw x, [r11]}, m0, m1, m2 + REPX {pmullw x, m8}, m0, m1, m2 psubw m8, m3, m4 ; p1-p0 psubw m9, m5, m6 ; q1-q0 @@ -430,7 +433,8 @@ SECTION .text pabsw m10, m10 pmaxuw m9, m10 %endif - pcmpgtw m9, [r11] ; !flat8in + vpbroadcastd m10, [r11] + pcmpgtw m9, m10 ; !flat8in psubw m10, m13, m3 ; p2-p1 pabsw m10, m10 @@ -503,7 +507,8 @@ SECTION .text pmaxuw m0, m2 pmaxuw m1, m10 pmaxuw m1, m0 - pcmpgtw m1, [r11] ; !flat8out + vpbroadcastd m0, [r11] + pcmpgtw m1, m0 ; !flat8out por m1, m9 ; !flat8in | !flat8out vpbroadcastd m2, [maskq+8] pand m10, m2, m12 @@ -544,12 +549,8 @@ SECTION .text %endif ; short filter - - vpbroadcastw m0, r7m - pcmpeqw m2, m2 - psrlw m0, 1 ; 511 or 2047 - pxor m2, m0 ; -512 or -2048 - + vpbroadcastd m0, [r11+8*1] ; 511 or 2047 + vpbroadcastd m2, [r11+8*2] ; -512 or 
-2048 psubw m10, m5, m4 paddw m11, m10, m10 paddw m11, m10 @@ -561,17 +562,18 @@ SECTION .text pminsw m10, m0 pmaxsw m10, m2 pand m8, m10 ; f&=fm - paddw m10, m8, [pw_3] - paddw m8, [pw_4] + vpbroadcastd m10, [pw_4] + paddw m10, m8 + paddw m8, [pw_3] REPX {pminsw x, m0}, m10, m8 psraw m10, 3 ; f2 psraw m8, 3 ; f1 - paddw m4, m10 - psubw m5, m8 + psubw m5, m10 + paddw m4, m8 - paddw m8, [pw_1] - psraw m8, 1 ; f=(f1+1)>>1 - pandn m8, m7, m8 ; f&=!hev + paddw m10, [pw_1] + psraw m10, 1 ; f=(f1+1)>>1 + pandn m8, m7, m10 ; f&=!hev paddw m3, m8 psubw m6, m8 pxor m8, m8 @@ -603,8 +605,8 @@ SECTION .text mova [rsp+ 0*32], m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 - psllw m8, m0, 3 ; p6*8 - paddw m8, [pw_8] + paddw m8, m0, [pw_1] + psllw m8, 3 ; p6*8+8 paddw m10, m2, m7 ; p5+p4 psubw m8, m0 paddw m10, m10 ; (p5+p4)*2 @@ -759,7 +761,6 @@ SECTION .text psubw m8, m15 paddw m8, m0 psrlw m10, m8, 4 - pand m10, m1 %ifidn %2, v mova m9, [tmpq+strideq*1] %else @@ -788,6 +789,7 @@ SECTION .text %if %1 >= 8 ; flat8 filter + vpbroadcastd m7, [pw_4096] %ifidn %2, v mova m0, [tmpq+strideq*0] ; p3 %else @@ -799,43 +801,43 @@ SECTION .text paddw m2, m0 ; p1+p0+p3 paddw m8, m5 ; 2*(p3+p2)+q0 paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 - pmulhrsw m7, m2, [pw_4096] + pmulhrsw m10, m2, m7 paddw m8, m3, m6 psubw m2, m1 paddw m2, m8 - pmulhrsw m8, m2, [pw_4096] + pmulhrsw m8, m2, m7 - paddw m10, m0, m3 - paddw m11, m4, m14 - psubw m2, m10 - paddw m2, m11 - pmulhrsw m10, m2, [pw_4096] + paddw m11, m0, m3 + paddw m1, m4, m14 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m1, m2, m7 paddw m11, m0, m4 + pblendvb m4, m1, m9 paddw m1, m5, m15 psubw m2, m11 paddw m2, m1 - pmulhrsw m11, m2, [pw_4096] + pmulhrsw m11, m2, m7 paddw m2, m6 paddw m2, m15 paddw m1, m13, m5 + pblendvb m5, m11, m9 + pblendvb m13, m10, m9 psubw m2, m1 - pmulhrsw m1, m2, [pw_4096] + pmulhrsw m1, m2, m7 psubw m2, m3 + pblendvb m3, m8, m9 psubw m2, m6 - paddw m0, m15, m14 - paddw m2, m0 - pmulhrsw m2, [pw_4096] + pblendvb m6, m1, m9 + paddw m1, m15, 
m14 + paddw m2, m1 + pmulhrsw m2, m7 - vpblendvb m13, m13, m7, m9 - vpblendvb m3, m3, m8, m9 - vpblendvb m4, m4, m10, m9 - vpblendvb m5, m5, m11, m9 - vpblendvb m6, m6, m1, m9 - vpblendvb m14, m14, m2, m9 + pblendvb m14, m2, m9 %ifidn %2, v mova [tmpq+strideq*1], m13 ; p2 @@ -844,9 +846,7 @@ SECTION .text mova [dstq+strideq*0], m5 ; q0 mova [dstq+strideq*1], m6 ; q1 mova [dstq+strideq*2], m14 ; q2 -%else - mova m0, [rsp+5*32] -%if %1 == 8 +%elif %1 == 8 TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 ; write 8x16 @@ -871,29 +871,28 @@ SECTION .text vextracti128 [dstq+stride3q -8], m15, 1 lea dstq, [dstq+strideq*4] %else - mova m0, [rsp+6*32] + mova m8, [rsp+6*32] mova m1, [rsp+7*32] mova m2, [rsp+8*32] mova m7, [rsp+9*32] - mova m8, [rsp+5*32] - TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9 + TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9 - mova [dstq+strideq*0-16], xm0 + mova [dstq+strideq*0-16], xm8 mova [dstq+strideq*1-16], xm1 mova [dstq+strideq*2-16], xm2 mova [dstq+stride3q -16], xm7 lea tmpq, [dstq+strideq*4] - mova [tmpq+strideq*0-16], xm8 + mova [tmpq+strideq*0-16], xm0 mova [tmpq+strideq*1-16], xm13 mova [tmpq+strideq*2-16], xm3 mova [tmpq+stride3q -16], xm4 lea tmpq, [tmpq+strideq*4] - vextracti128 [tmpq+strideq*0-16], m0, 1 + vextracti128 [tmpq+strideq*0-16], m8, 1 vextracti128 [tmpq+strideq*1-16], m1, 1 vextracti128 [tmpq+strideq*2-16], m2, 1 vextracti128 [tmpq+stride3q -16], m7, 1 lea tmpq, [tmpq+strideq*4] - vextracti128 [tmpq+strideq*0-16], m8, 1 + vextracti128 [tmpq+strideq*0-16], m0, 1 vextracti128 [tmpq+strideq*1-16], m13, 1 vextracti128 [tmpq+strideq*2-16], m3, 1 vextracti128 [tmpq+stride3q -16], m4, 1 @@ -924,39 +923,38 @@ SECTION .text vextracti128 [dstq+stride3q ], m3, 1 lea dstq, [dstq+strideq*4] %endif -%endif %elif %1 == 6 ; flat6 filter - + vpbroadcastd m7, [pw_4096] paddw m8, m3, m4 paddw m8, m13 ; p2+p1+p0 paddw m11, m13, m5 paddw m8, m8 paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 - pmulhrsw m2, m8, [pw_4096] + pmulhrsw m2, m8, m7 paddw m8, m5 paddw m11, m13, 
m13 paddw m8, m6 psubw m8, m11 - pmulhrsw m10, m8, [pw_4096] + pmulhrsw m10, m8, m7 paddw m8, m6 paddw m11, m13, m3 paddw m8, m14 psubw m8, m11 - pmulhrsw m11, m8, [pw_4096] + pmulhrsw m11, m8, m7 psubw m8, m3 paddw m14, m14 psubw m8, m4 paddw m8, m14 - pmulhrsw m8, [pw_4096] + pmulhrsw m8, m7 - vpblendvb m3, m3, m2, m9 - vpblendvb m4, m4, m10, m9 - vpblendvb m5, m5, m11, m9 - vpblendvb m6, m6, m8, m9 + pblendvb m3, m2, m9 + pblendvb m4, m10, m9 + pblendvb m5, m11, m9 + pblendvb m6, m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m3 ; p1 @@ -982,10 +980,10 @@ INIT_YMM avx2 cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq @@ -1013,7 +1011,7 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ test [maskq+0], mask_bitsd ; vmask[0] jz .end - FILTER 4, v + call .v4 .end: pslld m12, 4 @@ -1023,15 +1021,19 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ sub wd, 4 jg .loop RET +ALIGN function_align +.v4: + FILTER 4, v + ret INIT_YMM avx2 cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 @@ -1058,7 +1060,7 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter - FILTER 4, h + call .h4 jmp .end .no_filter: @@ -1071,15 +1073,19 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ sub hd, 4 jg .loop RET +ALIGN function_align +.h4: + FILTER 4, h + ret INIT_YMM avx2 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 
; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq @@ -1100,7 +1106,7 @@ cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ test [maskq+0], mask_bitsd ; vmask[0] jz .end - FILTER 4, v + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4 .end: pslld m12, 4 @@ -1115,10 +1121,10 @@ INIT_YMM avx2 cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 @@ -1138,7 +1144,7 @@ cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter - FILTER 4, h + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4 jmp .end .no_filter: |