author     Dale Curtis <dalecurtis@chromium.org>    2022-12-16 22:37:46 +0000
committer  Michael Brüning <michael.bruning@qt.io>  2023-03-27 08:12:03 +0000
commit     c885ec409f9b6ffa25e03851729b1bc2ad2005b3 (patch)
tree       0c9f205efc231ede87d2704b2780d1569caf5111 /chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
parent     0d63fc949d16f3e37ed7ab43d335b9d81cc6fdf7 (diff)
[Backport] Security bug 1401571
Manual update of libdav1d to match the version introduced by patch
https://chromium-review.googlesource.com/c/chromium/src/+/4114163:
Roll src/third_party/dav1d/libdav1d/ 87f9a81cd..ed63a7459 (104 commits)
This roll required a few changes to get it working:
- "properties" => "built-in options" cross-file configuration change due to a Meson deprecation (see the sketch after this list).
- Generic config creation never worked, so it was fixed.
- PPC64 configs were never checked in, so those builds were switched to the generic config.
- Copyright header changes for generate_sources.
- Updated README.chromium with potential issues that can arise.
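For reference, the Meson deprecation behind the first item moves built-in options such as c_args out of the [properties] section of machine (cross) files and into a dedicated [built-in options] section. A minimal sketch of the rename, with illustrative compiler flags rather than dav1d's actual configuration:

    # crossfile.ini -- before (deprecated in newer Meson releases)
    [properties]
    c_args = ['-target', 'aarch64-linux-gnu']

    # after: built-in options get their own section
    [built-in options]
    c_args = ['-target', 'aarch64-linux-gnu']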
https://chromium.googlesource.com/external/github.com/videolan/dav1d.git/+log/87f9a81cd770..ed63a7459376
$ git log 87f9a81cd..ed63a7459 --date=short --no-merges --format='%ad %ae %s'
2022-12-09 jamrial dav1d: add an option to skip decoding some frame types
2022-12-08 jamrial picture: support creating and freeing refs without tile data
2022-12-07 gramner x86: Add 10bpc 8x32/32x8 itx AVX-512 (Ice Lake) asm
2022-12-07 gramner x86: Add minor DC-only IDCT optimizations
2022-12-13 gramner getbits: Fix assertion failure
2022-12-13 gramner checkasm: Fix integer overflow in refmvs test
2022-01-26 gramner dav1dplay: Update to new libplacebo API
2022-12-09 gramner Add minor getbits improvements
2022-12-09 gramner Add a separate getbits function for getting a single bit
2022-12-09 gramner Remove redundant zeroing in sequence header parsing
2022-12-09 gramner Set the correct default value of initial_display_delay
2022-12-09 jamrial tools: remove the null last entry in inloop_filters_tbl
2022-12-04 lu_zero Do not assume the picture allocation starts at the left edge
2022-11-21 lu_zero ppc: Allocate the correct temp buffer size
2022-11-21 lu_zero ppc: Do not use static const with vec_splats
2022-11-02 charlie.c.hayden Add info to dav1d_send_data docs
2022-10-30 jbeich build: drop -D_DARWIN_C_SOURCE on macOS/iOS after 6b611d36acab
2022-10-30 jbeich build: drop -D_POSIX_C_SOURCE on non-Linux after 6b611d36acab
2022-06-28 victorien threading: Add a pending list for async task insertion
2022-10-26 martin Implement atomic_compare_exchange_strong in the atomic compat headers
2022-10-06 victorien threading: Fix a race around frame completion (frame-mt)
2022-10-07 sebastian Handle host_machine.system() 'ios' and 'tvos' the same way as 'darwin'
2022-09-23 gramner x86: Add 10-bit 8x8/8x16/16x8/16x16 itx AVX-512 (Ice Lake) asm
2022-09-30 gramner Specify hidden visibility for global data symbol declarations
2022-09-28 gramner build: strip() the result of cc.get_define()
2022-09-26 gramner checkasm: Move printf format string to .rodata on x86
2022-09-26 gramner checkasm: Improve 32-bit parameter clobbering on x86-64
2022-09-26 gramner x86: Fix incorrect 32-bit parameter usage in high bit-depth AVX-512 mc
2022-09-09 martin arm: itx: Add clipping to row_clip_min/max in the 10 bpc codepaths
2022-09-15 gramner x86: Fix overflows in 12bpc AVX2 IDCT/IADST
2022-09-15 gramner x86: Fix overflows in 12bpc AVX2 DC-only IDCT
2022-09-15 gramner x86: Fix clipping in high bit-depth AVX2 4x16 IDCT
2022-03-21 martin Don't use gas-preprocessor with clang-cl for arm targets
2022-06-07 david_conrad Fix checking the reference dimensions for the projection process
2022-06-07 david_conrad Fix calculation of OBMC lap dimensions
2022-06-07 david_conrad Support film grain application whose only effect is clipping to video range
2022-06-07 david_conrad Ignore T.35 metadata if the OBU contains no payload
2022-06-07 david_conrad Fix chroma deblock filter size calculation for lossless
2022-06-07 david_conrad Fix rounding in the calculation of initialSubpelX
2022-06-07 david_conrad Fix overflow when saturating dequantized coefficients clipped to 0
2022-06-08 david_conrad Fix overflow in 8-bit NEON ADST
2022-09-14 martin tools: Allocate the priv structs with proper alignment
2022-09-08 gramner x86: Fix clipping in 10bpc SSE4.1 IDCT asm
2022-09-08 gramner build: Improve Windows linking options
2022-09-08 gramner tools: Improve demuxer probing
2022-08-30 code CI: Disable trimming on some tests
2022-08-30 code CI: Remove git 'safe.directory' config
2022-08-30 code gcovr: Ignore parsing errors
2022-08-30 code crossfiles: Update Android toolchains
2022-08-30 code CI: Update images
(...)
2022-09-01 victorien checkasm: Add short options
2022-09-01 victorien checkasm: Add pattern matching to --test
2022-09-01 victorien checkasm: Remove pattern matching from --bench
2022-08-29 victorien checkasm: Add a --function option
2022-08-30 victorien threading: Fix copy_lpf_progress initialization
2022-08-19 jamrial data: don't overwrite the Dav1dDataProps size value
2022-07-18 gramner Adjust inlining attributes on some functions
2022-07-19 gramner x86: Remove leftover instruction in loopfilter AVX2 asm
2022-06-07 david_conrad Enable pointer authentication in assembly when building arm64e
2022-06-07 david_conrad Don't trash the return stack buffer in the NEON loop filter
2022-07-03 thresh CI: Removed snap package generation
2022-07-06 gramner Eliminate unused C DSP functions at compile time
2022-07-06 gramner cpu: Inline dav1d_get_cpu_flags()
2022-06-22 gramner x86: Add minor loopfilter asm improvements
2022-06-15 gramner checkasm: Speed up signal handling
2022-06-15 gramner checkasm: Improve seed generation on Windows
2022-06-20 gramner ci: Don't specify a specific MacOS version
2022-06-14 gramner x86: Add high bit-depth loopfilter AVX-512 (Ice Lake) asm
2022-06-13 victorien checkasm/lpf: Use operating dimensions
2022-06-03 gramner checkasm: Print the cpu model and cpuid signature on x86
2022-06-03 gramner checkasm: Add a vzeroupper check on x86
2022-06-02 gramner x86: Add a workaround for quirky AVX-512 hardware behavior
2022-05-31 victorien checkasm: Fix uninitialized variable
2022-05-14 code CI: Update coverage collecting
2022-05-05 code CI: Add a build with the minimum requirements
2022-05-05 code CI: Deactivate git 'safe.directory'
2022-03-24 code CI: Update images
2022-05-25 victorien Fix typo
2022-05-19 gramner x86: Add high bit-depth cdef_filter AVX-512 (Ice Lake) asm
2022-05-20 gramner checkasm: Print --help message to stderr instead of stdout
2022-05-20 gramner checkasm: Split cdef test into separate pri/sec/pri+sec parts
2022-05-20 gramner checkasm: Improve benchmarking of functions that modify their input
2022-05-18 b x86/itx_avx2: fix typo
2022-04-22 code CI: Add gcc12 and clang14 builds with mold linker
2022-04-26 code CI: Trigger documentation rebuild if configuration changes
2022-04-24 code meson/doc: Fix doxygen config
2022-04-28 gramner Use a relaxed memory ordering in dav1d_ref_inc()
2022-04-28 gramner Remove redundant code in dav1d_cdf_thread_unref()
2022-04-28 gramner Inline dav1d_ref_inc()
2022-04-24 code x86/itx: Add 32x8 12bpc AVX2 transforms
2022-04-24 code x86/itx: Add 8x32 12bpc AVX2 transforms
2022-04-24 code x86/itx: Deduplicate dconly code
2022-04-23 code lib: Fix typo in documentation
2022-04-07 jamrial obu: don't output invisible but showable key frames more than once
2022-04-07 jamrial obu: check that the frame referenced by existing_frame_idx is showable
2022-04-07 jamrial obu: check refresh_frame_flags is not equal to allFrames on Intra Only frames
2022-03-29 robux4 remove multipass wait from dav1d_decode_frame
2022-04-07 jamrial picture: ensure the new seq header and op param info flags are attached to the next visible picture in display order
2022-03-31 jamrial lib: add a function to query the decoder frame delay
2022-03-31 jamrial lib: split calculating thread count to its own function
Created with:
roll-dep src/third_party/dav1d/libdav1d
Fixed: 1401571
Change-Id: Ic3cef540a87a2cf411abe6071fd4c9963ea61f75
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/4114163
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Dale Curtis <dalecurtis@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1084574}
Reviewed-on: https://codereview.qt-project.org/c/qt/qtwebengine-chromium/+/468619
Reviewed-by: Michal Klocek <michal.klocek@qt.io>
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm')
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm  718
1 file changed, 345 insertions, 373 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
index c09dced418b..0218b624d3c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
@@ -80,25 +80,24 @@ SECTION .text
     punpckhwd m%1, m%3
     kmovw k1, k6
     lea t0, [dstq+strideq*4]
-    vpscatterdd [dstq+m29-2]{k1}, m%4
+    vpscatterdd [dstq+m19-2]{k1}, m%4
     kmovw k1, k6
     lea t1, [dstq+strideq*8]
-    vpscatterdd [t0 +m29-2]{k1}, m%5
+    vpscatterdd [t0 +m19-2]{k1}, m%5
     kmovw k1, k6
     lea t2, [t0 +strideq*8]
-    vpscatterdd [t1 +m29-2]{k1}, m%2
+    vpscatterdd [t1 +m19-2]{k1}, m%2
     kmovw k1, k6
-    vpscatterdd [t2 +m29-2]{k1}, m%1
+    vpscatterdd [t2 +m19-2]{k1}, m%1
 %endmacro

 %macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
 %if %1 == 0
-    SWAP m16, m15
+    SWAP m16, m22
 %endif
-    ; input in m0-15
-    punpcklbw m15, m0, m1
-    punpckhbw m0, m1
-    punpcklbw m1, m2, m3
+    punpcklbw m22, m24, m26
+    punpckhbw m24, m26
+    punpcklbw m26, m2, m3
     punpckhbw m2, m3
     punpcklbw m3, m4, m5
     punpckhbw m4, m5
@@ -108,21 +107,21 @@ SECTION .text
     punpckhbw m8, m9
     punpcklbw m9, m10, m11
     punpckhbw m10, m11
-    punpcklbw m11, m12, m13
-    punpckhbw m12, m13
+    punpcklbw m11, m25, m13
+    punpckhbw m25, m13
 %if %1 == 0
     SWAP m13, m16
 %else
     mova m13, %3
 %endif
-    SWAP m16, m12
-    punpcklbw m12, m14, m13
+    SWAP m16, m25
+    punpcklbw m25, m14, m13
     punpckhbw m13, m14, m13
-    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
-    punpcklwd m14, m15, m1
-    punpckhwd m15, m1
-    punpcklwd m1, m0, m2
-    punpckhwd m0, m2
+    ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+    punpcklwd m14, m22, m26
+    punpckhwd m22, m26
+    punpcklwd m26, m24, m2
+    punpckhwd m24, m2
     punpcklwd m2, m3, m5
     punpckhwd m3, m5
     punpcklwd m5, m4, m6
@@ -131,58 +130,58 @@ SECTION .text
     punpckhwd m7, m9
     punpcklwd m9, m8, m10
     punpckhwd m8, m10
-    punpcklwd m10, m11, m12
-    punpckhwd m11, m12
-    SWAP m12, m16, m11
-    punpcklwd m11, m12, m13
-    punpckhwd m12, m13
-    ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+    punpcklwd m10, m11, m25
+    punpckhwd m11, m25
+    SWAP m25, m16, m11
+    punpcklwd m11, m25, m13
+    punpckhwd m25, m13
+    ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
     punpckldq m13, m14, m2
     punpckhdq m14, m2
-    punpckldq m2, m15, m3
-    punpckhdq m15, m3
-    punpckldq m3, m1, m5
-    punpckhdq m1, m5
-    punpckldq m5, m0, m4
-    punpckhdq m0, m4
+    punpckldq m2, m22, m3
+    punpckhdq m22, m3
+    punpckldq m3, m26, m5
+    punpckhdq m26, m5
+    punpckldq m5, m24, m4
+    punpckhdq m24, m4
     punpckldq m4, m6, m10
     punpckhdq m6, m10
     punpckldq m10, m9, m11
     punpckhdq m9, m11
-    punpckldq m11, m8, m12
-    punpckhdq m8, m12
-    SWAP m12, m16, m8
-    punpckldq m8, m7, m12
-    punpckhdq m7, m12
-    ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
-    punpcklqdq m12, m13, m4
+    punpckldq m11, m8, m25
+    punpckhdq m8, m25
+    SWAP m25, m16, m8
+    punpckldq m8, m7, m25
+    punpckhdq m7, m25
+    ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+    punpcklqdq m25, m13, m4
     punpckhqdq m13, m4
     punpcklqdq m4, m14, m6
     punpckhqdq m14, m6
     punpcklqdq m6, m2, m8
     punpckhqdq m2, m8
-    punpcklqdq m8, m15, m7
-    punpckhqdq m15, m7
+    punpcklqdq m8, m22, m7
+    punpckhqdq m22, m7
     punpcklqdq m7, m3, m10
     punpckhqdq m3, m10
-    punpcklqdq m10, m1, m9
-    punpckhqdq m1, m9
+    punpcklqdq m10, m26, m9
+    punpckhqdq m26, m9
     punpcklqdq m9, m5, m11
     punpckhqdq m5, m11
     SWAP m11, m16
 %if %2 == 0
-    SWAP m16, m12
+    SWAP m16, m25
 %else
-    mova %3, m12
+    mova %3, m25
 %endif
-    punpcklqdq m12, m0, m11
-    punpckhqdq m0, m11
+    punpcklqdq m25, m24, m11
+    punpckhqdq m24, m11
 %if %2 == 0
     SWAP m11, m16
 %endif
-    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
-    SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
-    SWAP 3, 14, 12, 9
+    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+    SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+    SWAP 3, 14, 25, 9
 %endmacro

 %macro FILTER 2 ; width [4/6/8/16], dir [h/v]
@@ -205,7 +204,7 @@ SECTION .text
 %endif
     lea t0, [dstq+mstrideq*4]
 %if %1 != 6
-    mova m12, [t0 +strideq*0]
+    mova m25, [t0 +strideq*0]
 %endif
     mova m13, [t0 +strideq*1]
     mova m3, [t0 +strideq*2]
@@ -214,13 +213,13 @@ SECTION .text
     mova m6, [dstq+strideq*1]
     mova m14, [dstq+strideq*2]
 %if %1 != 6
-    mova m15, [dstq+stride3q ]
+    mova m22, [dstq+stride3q ]
 %endif
 %if %1 == 16
     lea t0, [dstq+strideq*4]
-    mova m19, [t0 +strideq*0]
-    mova m20, [t0 +strideq*1]
-    mova m21, [t0 +strideq*2]
+    mova m29, [t0 +strideq*0]
+    mova m30, [t0 +strideq*1]
+    mova m31, [t0 +strideq*2]
 %endif
 %endif
 %else ; h
@@ -230,15 +229,15 @@ SECTION .text
     vbroadcasti32x4 m0, [hshuf4]
     kmovw k1, k6
     lea t0, [dstq+strideq*4]
-    vpgatherdd m3{k1}, [dstq+m29-2]
+    vpgatherdd m3{k1}, [dstq+m19-2]
     kmovw k1, k6
     lea t1, [dstq+strideq*8]
-    vpgatherdd m4{k1}, [t0 +m29-2]
+    vpgatherdd m4{k1}, [t0 +m19-2]
     kmovw k1, k6
     lea t2, [t0 +strideq*8]
-    vpgatherdd m5{k1}, [t1 +m29-2]
+    vpgatherdd m5{k1}, [t1 +m19-2]
     kmovw k1, k6
-    vpgatherdd m6{k1}, [t2 +m29-2]
+    vpgatherdd m6{k1}, [t2 +m19-2]
     pshufb m3, m0
     pshufb m4, m0
     pshufb m5, m0
@@ -257,16 +256,16 @@ SECTION .text
 %elif %1 == 6 || %1 == 8
     kmovb k1, k7
     lea t0, [dstq+strideq*1]
-    vpgatherdq m3{k1}, [dstq+ym31-%1/2]
+    vpgatherdq m3{k1}, [dstq+ym21-%1/2]
     kmovb k1, k7
     lea t1, [dstq+strideq*2]
-    vpgatherdq m4{k1}, [t0 +ym31-%1/2]
+    vpgatherdq m4{k1}, [t0 +ym21-%1/2]
     kmovb k1, k7
     lea t2, [dstq+stride3q ]
-    vpgatherdq m5{k1}, [t1 +ym31-%1/2]
+    vpgatherdq m5{k1}, [t1 +ym21-%1/2]
     kmovb k1, k7
-    vextracti32x8 ym0, m31, 1
-    vpgatherdq m6{k1}, [t2 +ym31-%1/2]
+    vextracti32x8 ym0, m21, 1
+    vpgatherdq m6{k1}, [t2 +ym21-%1/2]
     kmovb k1, k7
     vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
     kmovb k1, k7
@@ -344,7 +343,7 @@ SECTION .text
     punpckhqdq m13, m5, m13
 %if %1 == 8
     punpcklqdq m5, m7, m12
-    punpckhqdq m12, m7, m12
+    punpckhqdq m25, m7, m12
     ; xm3: A0-15
     ; xm14: B0-15
     ; xm15: C0-15
@@ -352,10 +351,11 @@ SECTION .text
     ; xm4: E0-15
     ; xm13: F0-15
     ; xm5: G0-15
-    ; xm12: H0-15
-    SWAP 12, 3, 15
+    ; xm25: H0-15
+    SWAP 25, 3, 15
     SWAP 13, 14, 5, 4, 6
-    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+    SWAP 15, 22
+    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
 %else
     SWAP 13, 3, 14
     SWAP 6, 4, 15, 5
@@ -364,8 +364,8 @@ SECTION .text
 %else ; 16, h
     ; load and 16x16 transpose. We only use 14 pixels but we'll need the
    ; remainder at the end for the second transpose
-    movu xm0, [dstq+strideq*0-8]
-    movu xm1, [dstq+strideq*1-8]
+    movu xm24, [dstq+strideq*0-8]
+    movu xm26, [dstq+strideq*1-8]
     movu xm2, [dstq+strideq*2-8]
     movu xm3, [dstq+stride3q -8]
     lea t0, [dstq+strideq*4]
@@ -379,13 +379,13 @@ SECTION .text
     movu xm10, [t0 +strideq*2-8]
     movu xm11, [t0 +stride3q -8]
     lea t0, [t0 +strideq*4]
-    movu xm12, [t0 +strideq*0-8]
+    movu xm25, [t0 +strideq*0-8]
     movu xm13, [t0 +strideq*1-8]
     movu xm14, [t0 +strideq*2-8]
-    movu xm15, [t0 +stride3q -8]
+    movu xm22, [t0 +stride3q -8]
     lea t0, [t0 +strideq*4]
-    vinserti32x4 ym0, [t0 +strideq*0-8], 1
-    vinserti32x4 ym1, [t0 +strideq*1-8], 1
+    vinserti32x4 ym24, [t0 +strideq*0-8], 1
+    vinserti32x4 ym26, [t0 +strideq*1-8], 1
     vinserti32x4 ym2, [t0 +strideq*2-8], 1
     vinserti32x4 ym3, [t0 +stride3q -8], 1
     lea t0, [t0 +strideq*4]
@@ -399,13 +399,13 @@ SECTION .text
     vinserti32x4 ym10, [t0 +strideq*2-8], 1
     vinserti32x4 ym11, [t0 +stride3q -8], 1
     lea t0, [t0 +strideq*4]
-    vinserti32x4 ym12, [t0 +strideq*0-8], 1
+    vinserti32x4 ym25, [t0 +strideq*0-8], 1
     vinserti32x4 ym13, [t0 +strideq*1-8], 1
     vinserti32x4 ym14, [t0 +strideq*2-8], 1
-    vinserti32x4 ym15, [t0 +stride3q -8], 1
+    vinserti32x4 ym22, [t0 +stride3q -8], 1
     lea t0, [t0 +strideq*4]
-    vinserti32x4 m0, [t0 +strideq*0-8], 2
-    vinserti32x4 m1, [t0 +strideq*1-8], 2
+    vinserti32x4 m24, [t0 +strideq*0-8], 2
+    vinserti32x4 m26, [t0 +strideq*1-8], 2
     vinserti32x4 m2, [t0 +strideq*2-8], 2
     vinserti32x4 m3, [t0 +stride3q -8], 2
     lea t0, [t0 +strideq*4]
@@ -419,13 +419,13 @@ SECTION .text
     vinserti32x4 m10, [t0 +strideq*2-8], 2
     vinserti32x4 m11, [t0 +stride3q -8], 2
     lea t0, [t0 +strideq*4]
-    vinserti32x4 m12, [t0 +strideq*0-8], 2
+    vinserti32x4 m25, [t0 +strideq*0-8], 2
     vinserti32x4 m13, [t0 +strideq*1-8], 2
     vinserti32x4 m14, [t0 +strideq*2-8], 2
-    vinserti32x4 m15, [t0 +stride3q -8], 2
+    vinserti32x4 m22, [t0 +stride3q -8], 2
     lea t0, [t0 +strideq*4]
-    vinserti32x4 m0, [t0 +strideq*0-8], 3
-    vinserti32x4 m1, [t0 +strideq*1-8], 3
+    vinserti32x4 m24, [t0 +strideq*0-8], 3
+    vinserti32x4 m26, [t0 +strideq*1-8], 3
     vinserti32x4 m2, [t0 +strideq*2-8], 3
     vinserti32x4 m3, [t0 +stride3q -8], 3
     lea t0, [t0 +strideq*4]
@@ -439,41 +439,38 @@ SECTION .text
     vinserti32x4 m10, [t0 +strideq*2-8], 3
     vinserti32x4 m11, [t0 +stride3q -8], 3
     lea t0, [t0 +strideq*4]
-    vinserti32x4 m12, [t0 +strideq*0-8], 3
+    vinserti32x4 m25, [t0 +strideq*0-8], 3
     vinserti32x4 m13, [t0 +strideq*1-8], 3
     vinserti32x4 m14, [t0 +strideq*2-8], 3
-    vinserti32x4 m15, [t0 +stride3q -8], 3
+    vinserti32x4 m22, [t0 +stride3q -8], 3
     ;
     TRANSPOSE_16X16B 0, 1, [rsp+0*64]
-    SWAP m16, m1
+    SWAP m16, m26
     SWAP m17, m2
     SWAP m18, m3
-    SWAP m19, m12
-    SWAP m20, m13
-    SWAP m21, m14
-    mova [rsp+4*64], m15
-    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
-    SWAP 12, 4, 7
+    SWAP m29, m25
+    SWAP m30, m13
+    SWAP m31, m14
+    mova [rsp+4*64], m22
+    ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+    SWAP 25, 4, 7
     SWAP 13, 5, 8
     SWAP 3, 6, 9
     SWAP 10, 14
-    SWAP 11, 15
+    SWAP 11, 22
 %endif
 %endif
     ; load L/E/I/H
-%if is_uv
-    SWAP m22, m15
-%endif
-    vpbroadcastd m22, [pb_1]
+    vpbroadcastd m15, [pb_1]
 %ifidn %2, v
     movu m1, [lq]
     movu m0, [lq+l_strideq]
 %else
     kmovw k1, k6
-    vpgatherdd m0{k1}, [lq+m30+4]
+    vpgatherdd m0{k1}, [lq+m20+4]
     kmovw k1, k6
-    vpgatherdd m1{k1}, [lq+m30+0]
+    vpgatherdd m1{k1}, [lq+m20+0]
 %endif
     pxor m2, m2
     pcmpeqb k1, m0, m2
@@ -484,7 +481,7 @@ SECTION .text
     pand m2, [pb_63]{bcstd}
     vpbroadcastb m1, [lutq+136]
     pminub m2, m1
-    pmaxub m2, m22 ; I
+    pmaxub m2, m15 ; I
     pand m1, m0, [pb_240]{bcstd}
     psrlq m1, 4 ; H
     paddd m0, [pb_2]{bcstd}
@@ -500,7 +497,7 @@ SECTION .text
     ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
     pmaxub m9, m8
 %else
-    ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
+    ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
     pmaxub m9, m8
     ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
     pmaxub m9, m10
@@ -508,17 +505,17 @@ SECTION .text
     ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
     pmaxub m9, m10
 %if %1 != 6
-    ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
+    ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
     pmaxub m9, m10
 %endif
-    vpcmpub k2{k3}, m9, m22, 2 ; le ; flat8in
+    vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
 %if %1 == 6
     ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
 %else
-    ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
+    ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
     ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
     pmaxub m10, m11
-    ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
+    ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
     pmaxub m10, m11
 %endif
     ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
@@ -526,16 +523,10 @@ SECTION .text
 %if %1 == 16
     vpbroadcastd m11, [maskq+8]
     por m11, [maskq+4]{bcstd}
-    pand m11, pbmask
 %else
-%if !is_h || %1 == 6
-    pand m11, pbmask, [maskq+4]{bcstd}
-%else
     vpbroadcastd m11, [maskq+4]
-    pand m11, pbmask
-%endif
 %endif
-    pcmpeqd k4, m11, pbmask
+    vptestmd k4, m11, pbmask
     vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
     pmaxub m8, m10
 %endif
@@ -554,77 +545,58 @@ SECTION .text
     pmaxub m1, m2
     ABSSUB m2, m18, m4, m10
     pmaxub m1, m2
-    ABSSUB m2, m19, m5, m10
+    ABSSUB m2, m29, m5, m10
     pmaxub m1, m2
-    ABSSUB m2, m20, m5, m10
+    ABSSUB m2, m30, m5, m10
     pmaxub m1, m2
-    ABSSUB m2, m21, m5, m10
+    ABSSUB m2, m31, m5, m10
     pmaxub m1, m2
-    ;
-    vpcmpub k4, m1, m22, 2 ; flat8out
-    kandq k4, k4, k2 ; flat8in & flat8out
-
+    kandq k2, k2, k3
+    vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
     vpbroadcastd m2, [maskq+8]
-    pand m10, m2, pbmask
-    pcmpeqd k5, m10, pbmask
+    vptestmd k5, m2, pbmask
     vpmovm2d m7, k5
-    vpmovb2m k5, m7
-    kandq k4, k4, k5 ; flat16
-    kandq k4, k3, k4 ; flat16 & fm
+    vptestmb k4{k4}, m7, m7 ; flat16 & fm
     por m10, m2, [maskq+4]{bcstd}
-    pand m2, m10, pbmask
-    pcmpeqd k5, m2, pbmask
+    vptestmd k5, m10, pbmask
     vpmovm2d m7, k5
-    vpmovb2m k5, m7
-    kandq k2, k2, k5 ; flat8in
-    kandq k2, k3, k2
+    vptestmb k2{k2}, m7, m7 ; flat8in
     por m2, m10, [maskq+0]{bcstd}
-    pand m2, pbmask
-    pcmpeqd k5, m2, pbmask
+    vptestmd k5, m2, pbmask
     vpmovm2d m7, k5
-    vpmovb2m k5, m7
-    kandq k3, k3, k5
+    vptestmb k3{k3}, m7, m7
     kandnq k3, k2, k3 ; fm & !flat8 & !flat16
     kandnq k2, k4, k2 ; flat8 & !flat16
 %elif %1 != 4
     vpbroadcastd m0, [maskq+4]
-    pand m2, m0, pbmask
-    pcmpeqd k4, m2, pbmask
+    vptestmd k4, m0, pbmask
     vpmovm2d m7, k4
-    vpmovb2m k4, m7
-    kandq k2, k2, k4
+    vptestmb k2{k2}, m7, m7
     kandq k2, k2, k3 ; flat8 & fm
     por m0, [maskq+0]{bcstd}
-    pand m0, pbmask
-    pcmpeqd k4, m0, pbmask
+    vptestmd k4, m0, pbmask
     vpmovm2d m7, k4
-    vpmovb2m k4, m7
-    kandq k3, k3, k4
+    vptestmb k3{k3}, m7, m7
     kandnq k3, k2, k3 ; fm & !flat8
 %else
 %ifidn %2, v
-    pand m0, pbmask, [maskq+0]{bcstd}
+    vptestmd k4, pbmask, [maskq+0]{bcstd}
 %else
     vpbroadcastd m0, [maskq+0]
-    pand m0, pbmask
+    vptestmd k4, m0, pbmask
 %endif
-    pcmpeqd k4, m0, pbmask
     vpmovm2d m7, k4
-    vpmovb2m k4, m7
-    kandq k3, k3, k4 ; fm
+    vptestmb k3{k3}, m7, m7 ; fm
 %endif

     ; short filter
-%if is_uv
-    SWAP m23, m22
-    SWAP m24, m0
-    SWAP m25, m12
-    SWAP m26, m1
+%if %1 >= 8
+    SWAP m23, m15
 %endif
-    vpbroadcastd m23, [pb_3]
-    vpbroadcastd m24, [pb_4]
-    vpbroadcastd m25, [pb_16]
-    vpbroadcastd m26, [pb_64]
+    vpbroadcastd m15, [pb_3]
+    vpbroadcastd m0, [pb_4]
+    vpbroadcastd m12, [pb_16]
+    vpbroadcastd m1, [pb_64]
     pxor m3, pb128
     pxor m6, pb128
     psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
@@ -634,16 +606,16 @@ SECTION .text
     paddsb m10, m11
     paddsb m10, m11
     paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
-    paddsb m8, m10, m23
-    paddsb m10, m24
+    paddsb m8, m10, m15
+    paddsb m10, m0
     pand m8, [pb_248]{bcstd}
     pand m10, [pb_248]{bcstd}
     psrlq m8, 3
     psrlq m10, 3
-    pxor m8, m25
-    pxor m10, m25
-    psubb m8, m25 ; f2
-    psubb m10, m25 ; f1
+    pxor m8, m12
+    pxor m10, m12
+    psubb m8, m12 ; f2
+    psubb m10, m12 ; f1
     paddsb m4, m8
     psubsb m5, m10
     pxor m4, pb128
@@ -652,7 +624,7 @@ SECTION .text
     pxor m10, pb128
     pxor m8, m8
     pavgb m8, m10 ; f=(f1+1)>>1
-    psubb m8, m26
+    psubb m8, m1
     knotq k1, k1
     paddsb m3{k1}, m3, m8
     psubsb m6{k1}, m6, m8
@@ -664,40 +636,40 @@ SECTION .text
 %ifidn %2, v
     lea t0, [dstq+mstrideq*8]
 %endif
-    SWAP m0, m16, m14
-    SWAP m2, m17, m15
+    SWAP m24, m16, m14
+    SWAP m2, m17, m22
     SWAP m7, m18
     ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
     ; write -6
-    vpbroadcastd m26, [pb_7_1]
-    vpbroadcastd m25, [pb_2]
-    punpcklbw m14, m0, m12
-    punpckhbw m15, m0, m12
-    pmaddubsw m10, m14, m26
-    pmaddubsw m11, m15, m26 ; p6*7+p3
+    vpbroadcastd m1, [pb_7_1]
+    vpbroadcastd m12, [pb_2]
+    punpcklbw m14, m24, m25
+    punpckhbw m22, m24, m25
+    pmaddubsw m10, m14, m1
+    pmaddubsw m11, m22, m1 ; p6*7+p3
     punpcklbw m8, m2, m7
     punpckhbw m9, m2, m7
-    pmaddubsw m8, m25
-    pmaddubsw m9, m25
+    pmaddubsw m8, m12
+    pmaddubsw m9, m12
    paddw m10, m8
     paddw m11, m9 ; p6*7+p5*2+p4*2+p3
 %ifidn %2, h
     vpbroadcastd m27, [pw_2048]
-    vpbroadcastd m26, [pb_m1_1]
+    vpbroadcastd m1, [pb_m1_1]
     %define pw2048 m27
-    %define pbm1_1 m26
+    %define pbm1_1 m1
 %endif
     punpcklbw m8, m13, m3
     punpckhbw m9, m13, m3
-    pmaddubsw m8, m22
-    pmaddubsw m9, m22
+    pmaddubsw m8, m23
+    pmaddubsw m9, m23
     paddw m10, m8
     paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
     punpcklbw m8, m4, m5
     punpckhbw m9, m4, m5
-    pmaddubsw m8, m22
-    pmaddubsw m9, m22
+    pmaddubsw m8, m23
+    pmaddubsw m9, m23
     paddw m10, m8
     paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
     pmulhrsw m8, m10, pw2048
@@ -713,17 +685,17 @@ SECTION .text
     ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
     ; write -5
     pmaddubsw m14, pbm1_1
-    pmaddubsw m15, pbm1_1
+    pmaddubsw m22, pbm1_1
     paddw m10, m14
-    paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
-    punpcklbw m8, m0, m6
-    punpckhbw m9, m0, m6
+    paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+    punpcklbw m8, m24, m6
+    punpckhbw m9, m24, m6
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
     paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
     SWAP m18, m8
-    SWAP m22, m9
+    SWAP m23, m9
     pmulhrsw m8, m10, pw2048
     pmulhrsw m9, m11, pw2048
     packuswb m8, m9
@@ -737,8 +709,8 @@ SECTION .text
     ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
     ; write -4
     SWAP m14, m16
-    punpcklbw m8, m0, m13
-    punpckhbw m9, m0, m13
+    punpcklbw m8, m24, m13
+    punpckhbw m9, m24, m13
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
@@ -756,21 +728,21 @@ SECTION .text
 %ifidn %2, v
     vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
 %else
-    vpblendmb m8{k4}, m12, m8
+    vpblendmb m8{k4}, m25, m8
     mova [rsp+3*64], m8
 %endif

     ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
     ; write -3
-    SWAP m15, m17
-    punpcklbw m8, m0, m3
-    punpckhbw m9, m0, m3
+    SWAP m22, m17
+    punpcklbw m8, m24, m3
+    punpckhbw m9, m24, m3
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
     paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
-    punpcklbw m8, m7, m15
-    punpckhbw m7, m15
+    punpcklbw m8, m7, m22
+    punpckhbw m7, m22
     pmaddubsw m8, pbm1_1
     pmaddubsw m7, pbm1_1
     paddw m10, m8
@@ -779,69 +751,69 @@ SECTION .text
     pmulhrsw m8, m10, pw2048
     pmulhrsw m9, m11, pw2048
     packuswb m8, m9
-    vpblendmb m23{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+    vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F

     ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
     ; write -2
 %ifidn %2, v
     lea t0, [dstq+strideq*4]
 %endif
-    punpcklbw m8, m0, m4
-    punpckhbw m9, m0, m4
+    punpcklbw m8, m24, m4
+    punpckhbw m9, m24, m4
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
     paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
-    punpcklbw m8, m12, m19
-    punpckhbw m9, m12, m19
-    SWAP m1, m19
+    punpcklbw m8, m25, m29
+    punpckhbw m9, m25, m29
+    SWAP m26, m29
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
     paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
-    SWAP m19, m8
-    SWAP m24, m9
+    SWAP m29, m8
+    SWAP m0, m9
     pmulhrsw m8, m10, pw2048
     pmulhrsw m9, m11, pw2048
     packuswb m8, m9
-    vpblendmb m25{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+    vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G

     ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
     ; write -1
 %ifidn %2, h
-    SWAP m28, m0
-    punpcklbw m8, m28, m5
-    punpckhbw m0, m28, m5
+    SWAP m28, m24
+    punpcklbw m8, m28, m5
+    punpckhbw m24, m28, m5
 %else
-    punpcklbw m8, m0, m5
-    punpckhbw m0, m5
+    punpcklbw m8, m24, m5
+    punpckhbw m24, m5
 %endif
     pmaddubsw m8, pbm1_1
-    pmaddubsw m0, pbm1_1
+    pmaddubsw m24, pbm1_1
     paddw m10, m8
-    paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
-    punpcklbw m0, m13, m20
-    punpckhbw m9, m13, m20
+    paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+    punpcklbw m24, m13, m30
+    punpckhbw m9, m13, m30
 %ifidn %2, h
-    SWAP m27, m20
+    SWAP m27, m30
 %endif
-    SWAP m13, m23
-    pmaddubsw m0, pbm1_1
+    SWAP m13, m15
+    pmaddubsw m24, pbm1_1
     pmaddubsw m9, pbm1_1
-    paddw m10, m0
+    paddw m10, m24
     paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
-    SWAP m20, m0
-    SWAP m23, m9
+    SWAP m30, m24
+    SWAP m15, m9
 %ifidn %2, h
-    SWAP m9, m0
+    SWAP m9, m24
     %define pw2048 m9
 %endif
-    pmulhrsw m0, m10, pw2048
+    pmulhrsw m24, m10, pw2048
     pmulhrsw m8, m11, pw2048
     paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
-    paddw m11, m22
-    packuswb m0, m8
-    punpcklbw m8, m3, m21
+    paddw m11, m23
+    packuswb m24, m8
+    punpcklbw m8, m3, m31
     pmaddubsw m8, pbm1_1
     paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
     SWAP m18, m8
@@ -851,34 +823,34 @@ SECTION .text
     SWAP m16, m9
     %define pw2048 m16
 %endif
-    punpckhbw m9, m3, m21
-    SWAP m3, m25
+    punpckhbw m9, m3, m31
+    SWAP m3, m12
     pmaddubsw m9, pbm1_1
     paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
-    SWAP m22, m9
+    SWAP m23, m9
     pmulhrsw m9, m11, pw2048
     paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
 %ifidn %2, h
-    SWAP m2, m26
+    SWAP m2, m1
     %define pbm1_1 m2
 %endif
-    vpblendmb m26{k4}, m4, m0 ; don't clobber p0/m4 since we need it in H
+    vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H

     ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
     ; write +0
-    SWAP m0, m21 ; q6
+    SWAP m24, m31 ; q6
     packuswb m8, m9
 %ifidn %2, h
-    SWAP m21, m2
-    %define pbm1_1 m21
+    SWAP m31, m2
    %define pbm1_1 m31
 %endif
-    vpblendmb m25{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+    vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I

     ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
     ; write +1
-    punpcklbw m8, m4, m0
-    punpckhbw m2, m4, m0
-    SWAP m4, m26
+    punpcklbw m8, m4, m24
+    punpckhbw m2, m4, m24
+    SWAP m4, m1
     pmaddubsw m8, pbm1_1
     pmaddubsw m2, pbm1_1
     paddw m10, m8
@@ -892,9 +864,9 @@ SECTION .text
     ; write +2
     paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
     paddw m11, m7
-    punpcklbw m8, m5, m0
-    punpckhbw m9, m5, m0
-    SWAP m5, m25
+    punpcklbw m8, m5, m24
+    punpckhbw m9, m5, m24
+    SWAP m5, m12
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
@@ -906,10 +878,10 @@ SECTION .text

     ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
     ; write +3
-    paddw m10, m19 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
-    paddw m11, m24
-    punpcklbw m8, m6, m0
-    punpckhbw m9, m6, m0
+    paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+    paddw m11, m0
+    punpcklbw m8, m6, m24
+    punpckhbw m9, m6, m24
     SWAP 2, 6
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
@@ -921,20 +893,20 @@ SECTION .text
 %ifidn %2, v
     vmovdqu8 [t0+mstrideq]{k4}, m8
 %else
-    SWAP m19, m16
-    %define pw2048 m19
-    vpblendmb m16{k4}, m15, m8
+    SWAP m29, m16
+    %define pw2048 m29
+    vpblendmb m16{k4}, m22, m8
 %endif
     ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
     ; write +4
-    paddw m10, m20 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
-    paddw m11, m23
+    paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+    paddw m11, m15
 %ifidn %2, h
-    SWAP m23, m8
+    SWAP m15, m8
 %endif
-    punpcklbw m8, m14, m0
-    punpckhbw m9, m14, m0
+    punpcklbw m8, m14, m24
+    punpckhbw m9, m14, m24
     SWAP 14, 7
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
@@ -946,16 +918,16 @@ SECTION .text
 %ifidn %2, v
     vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
 %else
-    vpblendmb m17{k4}, m1, m8
+    vpblendmb m17{k4}, m26, m8
 %endif

     ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
     ; write +5
     paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
-    paddw m11, m22
-    punpcklbw m8, m15, m0
-    punpckhbw m9, m15, m0
-    SWAP m20, m0
+    paddw m11, m23
+    punpcklbw m8, m22, m24
+    punpckhbw m9, m22, m24
+    SWAP m30, m24
     pmaddubsw m8, pbm1_1
     pmaddubsw m9, pbm1_1
     paddw m10, m8
@@ -979,26 +951,26 @@ SECTION .text
     vpbroadcastd m9, [pb_3_1]
     vpbroadcastd m10, [pb_2_1]
 %if %1 == 16
-    vpbroadcastd m22, [pb_1]
-    vpbroadcastd m24, [pb_4]
+    vpbroadcastd m23, [pb_1]
+    vpbroadcastd m0, [pb_4]
 %elifidn %2, h
-    vpbroadcastd m21, [pb_m1_1]
-    %define pbm1_1 m21
+    vpbroadcastd m31, [pb_m1_1]
+    %define pbm1_1 m31
 %endif
-    punpcklbw m0, m12, m3
-    punpckhbw m1, m12, m3
-    pmaddubsw m2, m0, m9
-    pmaddubsw m7, m1, m9 ; 3 * p3 + p1
+    punpcklbw m24, m25, m3
+    punpckhbw m26, m25, m3
+    pmaddubsw m2, m24, m9
+    pmaddubsw m7, m26, m9 ; 3 * p3 + p1
     punpcklbw m8, m13, m4
     punpckhbw m11, m13, m4
     pmaddubsw m8, m10
     pmaddubsw m11, m10
     paddw m2, m8
     paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
-    punpcklbw m8, m5, m24
-    punpckhbw m11, m5, m24
-    pmaddubsw m8, m22
-    pmaddubsw m11, m22
+    punpcklbw m8, m5, m0
+    punpckhbw m11, m5, m0
+    pmaddubsw m8, m23
+    pmaddubsw m11, m23
     paddw m2, m8
     paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
     psrlw m8, m2, 3
@@ -1015,8 +987,8 @@ SECTION .text
 %endif
 %endif

-    pmaddubsw m8, m0, pbm1_1
-    pmaddubsw m11, m1, pbm1_1
+    pmaddubsw m8, m24, pbm1_1
+    pmaddubsw m11, m26, pbm1_1
     paddw m2, m8
     paddw m7, m11
     punpcklbw m8, m13, m6
@@ -1035,14 +1007,14 @@ SECTION .text
     SWAP m18, m8
 %endif

-    pmaddubsw m0, m22
-    pmaddubsw m1, m22
-    psubw m2, m0
-    psubw m7, m1
+    pmaddubsw m24, m23
+    pmaddubsw m26, m23
+    psubw m2, m24
+    psubw m7, m26
     punpcklbw m8, m4, m14
     punpckhbw m11, m4, m14
-    pmaddubsw m8, m22
-    pmaddubsw m11, m22
+    pmaddubsw m8, m23
+    pmaddubsw m11, m23
     paddw m2, m8
     paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
     psrlw m8, m2, 3
@@ -1052,19 +1024,19 @@ SECTION .text
 %ifidn %2, v
     mova [t0+stride3q], m8
 %else
-    SWAP m19, m8
+    SWAP m29, m8
 %endif

-    punpcklbw m0, m5, m15
-    punpckhbw m1, m5, m15
-    pmaddubsw m8, m0, m22
-    pmaddubsw m11, m1, m22
+    punpcklbw m24, m5, m22
+    punpckhbw m26, m5, m22
+    pmaddubsw m8, m24, m23
+    pmaddubsw m11, m26, m23
     paddw m2, m8
     paddw m7, m11
-    punpcklbw m8, m4, m12
-    punpckhbw m11, m4, m12
-    pmaddubsw m8, m22
-    pmaddubsw m11, m22
+    punpcklbw m8, m4, m25
+    punpckhbw m11, m4, m25
+    pmaddubsw m8, m23
+    pmaddubsw m11, m23
     psubw m2, m8
     psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
     psrlw m8, m2, 3
@@ -1075,10 +1047,10 @@ SECTION .text
     mova [dstq+strideq*0], m11
 %endif

-    pmaddubsw m0, pbm1_1
-    pmaddubsw m1, pbm1_1
-    paddw m2, m0
-    paddw m7, m1
+    pmaddubsw m24, pbm1_1
+    pmaddubsw m26, pbm1_1
+    paddw m2, m24
+    paddw m7, m26
     punpcklbw m8, m13, m6
     punpckhbw m13, m6
     pmaddubsw m8, pbm1_1
@@ -1093,18 +1065,18 @@ SECTION .text
     mova [dstq+strideq*1], m13
 %endif

-    punpcklbw m0, m3, m6
-    punpckhbw m1, m3, m6
-    pmaddubsw m0, m22
-    pmaddubsw m1, m22
-    psubw m2, m0
-    psubw m7, m1
-    punpcklbw m0, m14, m15
-    punpckhbw m1, m14, m15
-    pmaddubsw m0, m22
-    pmaddubsw m1, m22
-    paddw m2, m0
-    paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
+    punpcklbw m24, m3, m6
+    punpckhbw m26, m3, m6
+    pmaddubsw m24, m23
+    pmaddubsw m26, m23
+    psubw m2, m24
+    psubw m7, m26
+    punpcklbw m24, m14, m22
+    punpckhbw m26, m14, m22
+    pmaddubsw m24, m23
+    pmaddubsw m26, m23
+    paddw m2, m24
+    paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
     psrlw m2, 3
     psrlw m7, 3
     packuswb m2, m7
@@ -1120,36 +1092,36 @@ SECTION .text
 %endif

 %ifidn %2, h
-    SWAP m0, m18
-    SWAP m1, m19
+    SWAP m24, m18
+    SWAP m26, m29
 %if %1 == 8
     ; 16x8 transpose
-    punpcklbw m3, m12, m10
-    punpckhbw m12, m10
-    punpcklbw m10, m0, m1
-    punpckhbw m0, m1
-    punpcklbw m1, m11, m13
+    punpcklbw m3, m25, m10
+    punpckhbw m25, m10
+    punpcklbw m10, m24, m26
+    punpckhbw m24, m26
+    punpcklbw m26, m11, m13
     punpckhbw m11, m13
-    punpcklbw m13, m2, m15
-    punpckhbw m2, m15
+    punpcklbw m13, m2, m22
+    punpckhbw m2, m22
     ;
-    punpcklwd m15, m3, m10
+    punpcklwd m22, m3, m10
     punpckhwd m3, m10
-    punpcklwd m10, m12, m0
-    punpckhwd m12, m0
-    punpcklwd m0, m1, m13
-    punpckhwd m1, m13
+    punpcklwd m10, m25, m24
+    punpckhwd m25, m24
+    punpcklwd m24, m26, m13
+    punpckhwd m26, m13
     punpcklwd m13, m11, m2
     punpckhwd m11, m2
     ;
-    punpckldq m2, m15, m0
-    punpckhdq m15, m0
-    punpckldq m0, m3, m1
-    punpckhdq m3, m1
-    punpckldq m1, m10, m13
+    punpckldq m2, m22, m24
+    punpckhdq m22, m24
+    punpckldq m24, m3, m26
+    punpckhdq m3, m26
+    punpckldq m26, m10, m13
     punpckhdq m10, m13
-    punpckldq m13, m12, m11
-    punpckhdq m12, m11
+    punpckldq m13, m25, m11
+    punpckhdq m25, m11
     ; write 8x32
     vpbroadcastd ym16, strided
     pmulld ym16, [hmulD]
@@ -1162,8 +1134,8 @@ SECTION .text
     kmovb k3, k6
     kmovb k4, k6
     vpscatterdq [dstq+ym16-4]{k1}, m2
-    vpscatterdq [t1 +ym16-4]{k2}, m15
-    vpscatterdq [t2 +ym16-4]{k3}, m0
+    vpscatterdq [t1 +ym16-4]{k2}, m22
+    vpscatterdq [t2 +ym16-4]{k3}, m24
     vpscatterdq [t3 +ym16-4]{k4}, m3
     lea t1, [t0+strideq*2]
     lea t2, [t0+strideq*4]
@@ -1172,29 +1144,29 @@ SECTION .text
     kmovb k2, k6
     kmovb k3, k6
     kmovb k4, k6
-    vpscatterdq [t0+ym16-4]{k1}, m1
+    vpscatterdq [t0+ym16-4]{k1}, m26
     vpscatterdq [t1+ym16-4]{k2}, m10
     vpscatterdq [t2+ym16-4]{k3}, m13
-    vpscatterdq [t3+ym16-4]{k4}, m12
+    vpscatterdq [t3+ym16-4]{k4}, m25
 %else
     ; 16x16 transpose and store
     SWAP 5, 10, 2
-    SWAP 6, 0
-    SWAP 7, 1
+    SWAP 6, 24
+    SWAP 7, 26
     SWAP 8, 11
     SWAP 9, 13
-    mova m0, [rsp+0*64]
-    SWAP m1, m28
+    mova m24, [rsp+0*64]
+    SWAP m26, m28
     mova m2, [rsp+1*64]
     mova m3, [rsp+2*64]
     mova m4, [rsp+3*64]
     SWAP m11, m16
-    SWAP m12, m17
+    SWAP m25, m17
     SWAP m13, m27
-    SWAP m14, m20
+    SWAP m14, m30
     TRANSPOSE_16X16B 1, 0, [rsp+4*64]
-    movu [dstq+strideq*0-8], xm0
-    movu [dstq+strideq*1-8], xm1
+    movu [dstq+strideq*0-8], xm24
+    movu [dstq+strideq*1-8], xm26
     movu [dstq+strideq*2-8], xm2
     movu [dstq+stride3q -8], xm3
     lea t0, [dstq+strideq*4]
@@ -1208,13 +1180,13 @@ SECTION .text
     movu [t0+strideq*2-8], xm10
     movu [t0+stride3q -8], xm11
     lea t0, [t0+strideq*4]
-    movu [t0+strideq*0-8], xm12
+    movu [t0+strideq*0-8], xm25
     movu [t0+strideq*1-8], xm13
     movu [t0+strideq*2-8], xm14
-    movu [t0+stride3q -8], xm15
+    movu [t0+stride3q -8], xm22
     lea t0, [t0+strideq*4]
-    vextracti128 [t0+strideq*0-8], ym0, 1
-    vextracti128 [t0+strideq*1-8], ym1, 1
+    vextracti128 [t0+strideq*0-8], ym24, 1
+    vextracti128 [t0+strideq*1-8], ym26, 1
     vextracti128 [t0+strideq*2-8], ym2, 1
     vextracti128 [t0+stride3q -8], ym3, 1
     lea t0, [t0+strideq*4]
@@ -1228,13 +1200,13 @@ SECTION .text
     vextracti128 [t0+strideq*2-8], ym10, 1
     vextracti128 [t0+stride3q -8], ym11, 1
     lea t0, [t0+strideq*4]
-    vextracti128 [t0+strideq*0-8], ym12, 1
+    vextracti128 [t0+strideq*0-8], ym25, 1
     vextracti128 [t0+strideq*1-8], ym13, 1
     vextracti128 [t0+strideq*2-8], ym14, 1
-    vextracti128 [t0+stride3q -8], ym15, 1
+    vextracti128 [t0+stride3q -8], ym22, 1
     lea t0, [t0+strideq*4]
-    vextracti32x4 [t0+strideq*0-8], m0, 2
-    vextracti32x4 [t0+strideq*1-8], m1, 2
+    vextracti32x4 [t0+strideq*0-8], m24, 2
+    vextracti32x4 [t0+strideq*1-8], m26, 2
     vextracti32x4 [t0+strideq*2-8], m2, 2
     vextracti32x4 [t0+stride3q -8], m3, 2
     lea t0, [t0+strideq*4]
@@ -1248,13 +1220,13 @@ SECTION .text
     vextracti32x4 [t0+strideq*2-8], m10, 2
     vextracti32x4 [t0+stride3q -8], m11, 2
     lea t0, [t0+strideq*4]
-    vextracti32x4 [t0+strideq*0-8], m12, 2
+    vextracti32x4 [t0+strideq*0-8], m25, 2
     vextracti32x4 [t0+strideq*1-8], m13, 2
     vextracti32x4 [t0+strideq*2-8], m14, 2
-    vextracti32x4 [t0+stride3q -8], m15, 2
+    vextracti32x4 [t0+stride3q -8], m22, 2
     lea t0, [t0+strideq*4]
-    vextracti32x4 [t0+strideq*0-8], m0, 3
-    vextracti32x4 [t0+strideq*1-8], m1, 3
+    vextracti32x4 [t0+strideq*0-8], m24, 3
+    vextracti32x4 [t0+strideq*1-8], m26, 3
     vextracti32x4 [t0+strideq*2-8], m2, 3
     vextracti32x4 [t0+stride3q -8], m3, 3
     lea t0, [t0+strideq*4]
@@ -1268,19 +1240,15 @@ SECTION .text
     vextracti32x4 [t0+strideq*2-8], m10, 3
     vextracti32x4 [t0+stride3q -8], m11, 3
     lea t0, [t0+strideq*4]
-    vextracti32x4 [t0+strideq*0-8], m12, 3
+    vextracti32x4 [t0+strideq*0-8], m25, 3
     vextracti32x4 [t0+strideq*1-8], m13, 3
     vextracti32x4 [t0+strideq*2-8], m14, 3
-    vextracti32x4 [t0+stride3q -8], m15, 3
+    vextracti32x4 [t0+stride3q -8], m22, 3
 %endif
 %endif

 %elif %1 == 6
     ; flat6 filter
-    SWAP m15, m23
-    SWAP m0, m24
-    SWAP m12, m25
-    SWAP m1, m26
     vpbroadcastd m15, [pb_3_1]
     vpbroadcastd m12, [pb_2]
     punpcklbw m8, m13, m5
@@ -1381,17 +1349,16 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
     mov mstrideq, strideq
     neg mstrideq
     lea stride3q, [strideq*3]
-    mova m31, [pb_4x0_4x4_4x8_4x12]
-    mova m30, [pb_mask]
-    vpbroadcastd m29, [pb_128]
+    mova m21, [pb_4x0_4x4_4x8_4x12]
+    mova m20, [pb_mask]
+    vpbroadcastd m19, [pb_128]
     vpbroadcastd m28, [pb_m1_1]
     vpbroadcastd m27, [pw_2048]
-    %define pbshuf m31
-    %define pbmask m30
-    %define pb128 m29
+    %define pbshuf m21
+    %define pbmask m20
+    %define pb128 m19
     %define pbm1_1 m28
     %define pw2048 m27
-    %define is_uv 0

.loop:
     cmp word [maskq+8], 0 ; vmask[2]
@@ -1411,7 +1378,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
     cmp word [maskq+0], 0 ; vmask[0]
     je .end

-    FILTER 4, v
+    call .v4

.end:
     add lq, 64
@@ -1420,6 +1387,11 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
     sub wd, 16
     jg .loop
     RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+    FILTER 4, v
+    ret

cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
                          lut, h, stride3, stride8
@@ -1429,11 +1401,11 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
     lea stride3q, [strideq*3]
     lea stride8q, [strideq*8]
     kxnorw k6, k6, k6
-    vpbroadcastd m29, strided
-    vpbroadcastd m30, l_strided
-    pmulld m31, m29, [hmulA]
-    pmulld m30, m30, [hmulB]
-    pmulld m29, m29, [hmulC]
+    vpbroadcastd m19, strided
+    vpbroadcastd m20, l_strided
+    pmulld m21, m19, [hmulA]
+    pmulld m20, [hmulB]
+    pmulld m19, [hmulC]
     %define pbshuf [pb_4x0_4x4_4x8_4x12]
     %define pbmask [pb_mask]
     %define pb128 [pb_128]{bcstd}
@@ -1457,7 +1429,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
     cmp word [maskq+0], 0 ; vmask[0]
     je .end

-    FILTER 4, h
+    call .h4

.end:
     lea lq, [lq+l_strideq*8]
@@ -1466,9 +1438,13 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
     sub hd, 16
     jg .loop
     RET
+ALIGN function_align
RESET_MM_PERMUTATION
+.h4:
+    FILTER 4, h
+    ret

-cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
                           lut, w, stride3, mstride
    DECLARE_REG_TMP 9
    shl l_strideq, 2
@@ -1476,16 +1452,15 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
     mov mstrideq, strideq
     neg mstrideq
     lea stride3q, [strideq*3]
-    mova m20, [pb_4x0_4x4_4x8_4x12]
-    mova m19, [pb_mask]
-    vpbroadcastd m18, [pb_128]
+    mova m21, [pb_4x0_4x4_4x8_4x12]
+    mova m20, [pb_mask]
+    vpbroadcastd m19, [pb_128]
     vpbroadcastd m17, [pb_m1_1]
     vpbroadcastd m16, [pw_4096]
-    %define pbshuf m20
-    %define pbmask m19
-    %define pb128 m18
+    %define pbshuf m21
+    %define pbmask m20
+    %define pb128 m19
     %define pbm1_1 m17
-    %define is_uv 1

.loop:
     cmp word [maskq+4], 0 ; vmask[1]
@@ -1498,7 +1473,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
     cmp word [maskq+0], 0 ; vmask[0]
     je .end

-    FILTER 4, v
+    call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4

.end:
     add lq, 64
@@ -1525,17 +1500,14 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
     vpbroadcastd m19, strided
     vpbroadcastd m20, l_strided
     pmulld m21, m19, [hmulA]
-    pmulld m20, m20, [hmulB]
-    pmulld m19, m19, [hmulC]
+    pmulld m20, [hmulB]
+    pmulld m19, [hmulC]
     mova m18, [pb_mask]
     vpbroadcastd m17, [pb_128]
     vpbroadcastd m16, [pw_4096]
     %define pbshuf [pb_4x0_4x4_4x8_4x12]
     %define pbmask m18
     %define pb128 m17
-    %xdefine m31 m21
-    %xdefine m30 m20
-    %xdefine m29 m19
     add l_strideq, l_strideq

.loop:
@@ -1549,7 +1521,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
     cmp word [maskq+0], 0 ; vmask[0]
     je .end

-    FILTER 4, h
+    call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4

.end:
     lea lq, [lq+l_strideq*8]