;*****************************************************************************
;* x86-optimized functions for gblur filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .data

gblur_transpose_16x16_indices1: dq 2, 3, 0, 1, 6, 7, 4, 5
gblur_transpose_16x16_indices2: dq 1, 0, 3, 2, 5, 4, 7, 6
gblur_transpose_16x16_indices3: dd 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
gblur_transpose_16x16_mask:     dw 0xcc, 0x33, 0xaa, 0x55, 0xaaaa, 0x5555
gblur_vindex_width:             dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

SECTION .text

%xdefine AVX2_MMSIZE   32
%xdefine AVX512_MMSIZE 64

%macro MOVSXDIFNIDN 1-*
    %rep %0
        movsxdifnidn %1q, %1d
        %rotate 1
    %endrep
%endmacro

%macro KXNOR 2-*
    %if mmsize == AVX512_MMSIZE
        kxnorw %2, %2, %2
    %else
        %if %0 == 3
            mov %3, -1
        %else
            vpcmpeqd %1, %1, %1
        %endif
    %endif
%endmacro

%macro KMOVW 2-4
    %if mmsize == AVX2_MMSIZE && %0 == 4
        mova %1, %2
    %elif mmsize == AVX512_MMSIZE
        %if %0 == 4
            %rotate 2
        %endif
        kmovw %1, %2
    %endif
%endmacro

%macro PUSH_MASK 5
    %if mmsize == AVX2_MMSIZE
        %assign %%n mmsize/4
        %assign %%i 0
        %rep %%n
            mov %4, %3
            and %4, 1
            neg %4
            mov dword [%5 + %%i*4], %4
            sar %3, 1
            %assign %%i %%i+1
        %endrep
        movu %1, [%5]
    %else
        kmovd %2, %3
    %endif
%endmacro

%macro VMASKMOVPS 4
    %if mmsize == AVX2_MMSIZE
        vpmaskmovd %1, %3, %2
    %else
        kmovw k7, %4
        vmovups %1{k7}, %2
    %endif
%endmacro

%macro VGATHERDPS 4
    %if mmsize == AVX2_MMSIZE
        vgatherdps %1, %2, %3
    %else
        vgatherdps %1{%4}, %2
    %endif
%endmacro

%macro VSCATTERDPS128 7
    %rep 4
        mov %7, %6
        and %7, 1
        cmp %7, 0
        je %%end_scatter
        movss [%2 + %3*%4], xm%1
        vpshufd m%1, m%1, 0x39
        add %3, %5
        sar %6, 1
    %endrep
%%end_scatter:
%endmacro

; %1=register index
; %2=base address   %3=vindex
; %4=scale          %5=width
; %6=mask           %7=tmp
; m15=reserved
%macro VSCATTERDPS256 7
    mova m15, m%1
    xor %3, %3
    VSCATTERDPS128 15, %2, %3, %4, %5, %6, %7
    vextractf128 xm15, m%1, 1
    VSCATTERDPS128 15, %2, %3, %4, %5, %6, %7
%endmacro

; %1=base address   %2=avx2 vindex
; %3=avx512 vindex  %4=avx2 mask
; %5=avx512 mask    %6=register index
; %7=width          %8-*=tmp
%macro VSCATTERDPS 8-*
    %if mmsize == AVX2_MMSIZE
        %if %0 == 9
            mov %9, %4
            VSCATTERDPS256 %6, %1, %2, 4, %7, %9, %8
        %else
            VSCATTERDPS256 %6, %1, %2, 4, %7, %4, %8
        %endif
    %else
        vscatterdps [%1 + %3*4]{%5}, m%6
    %endif
%endmacro

%macro INIT_WORD_MASK 1-*
    %assign %%i 0
    %rep %0
        kmovw %1, [gblur_transpose_16x16_mask + %%i * 2]
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro INIT_INDICES 1-*
    %assign %%i 1
    %rep %0
        movu %1, [gblur_transpose_16x16_indices %+ %%i]
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%assign stack_offset 0
%macro PUSH_MM 1
    %if mmsize == AVX2_MMSIZE
        movu [rsp + stack_offset], %1
        %assign stack_offset stack_offset+mmsize
    %endif
%endmacro

%macro POP_MM 1
    %if mmsize == AVX2_MMSIZE
        %assign stack_offset stack_offset-mmsize
        movu %1, [rsp + stack_offset]
    %endif
%endmacro
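
; Note on the PUSH_MM/POP_MM pair above: on AVX2 only 16 vector registers are
; available, so they spill/reload a ymm register to the stack scratch space
; reserved by cglobal; on AVX-512 they expand to nothing (the 32 zmm registers
; are presumably enough without spilling). stack_offset is an assemble-time
; counter, so every PUSH_MM must be matched by a POP_MM on the same code path.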
%macro READ_LOCAL_BUFFER 1
    %if mmsize == AVX512_MMSIZE
        %assign %%i 19
    %else
        %assign %%i 9
    %endif
    %assign %%j %%i-1
    %assign %%k %1-1
    %xdefine %%m m %+ %%i
    mova %%m, m3
    FMULADD_PS %%m, %%m, m0, [localbufq + %%k * mmsize], %%m
    %assign %%k %%k-1
    %rep %1-1
        %xdefine %%m m %+ %%j
        mova %%m, m %+ %%i
        FMULADD_PS %%m, %%m, m0, [localbufq + %%k * mmsize], %%m
        %assign %%i %%i-1
        %assign %%j %%j-1
        %assign %%k %%k-1
    %endrep
    %if mmsize == AVX512_MMSIZE
        mova m3, m %+ %%i
    %endif
%endmacro

%macro FMADD_WRITE 4
    FMULADD_PS %1, %1, %2, %3, %1
    mova %4, %1
%endmacro

%macro WRITE_LOCAL_BUFFER_INTERNAL 8-16
    %assign %%i 0
    %rep %0
        FMADD_WRITE m3, m0, m %+ %1, [localbufq + %%i * mmsize]
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro GATHERPS 1
    %if mmsize == AVX512_MMSIZE
        %assign %%i 4
    %else
        %assign %%i 2
    %endif
    movu m %+ %%i, [ptrq]
    mov strideq, widthq
    %assign %%i %%i+1
    %rep %1-2
        movu m %+ %%i, [ptrq + strideq*4]
        add strideq, widthq
        %assign %%i %%i+1
    %endrep
    movu m %+ %%i, [ptrq + strideq*4]
%endmacro

%macro SCATTERPS_INTERNAL 8-16
    movu [ptrq + strideq*0], m %+ %1
    mov strideq, widthq
    %rotate 1
    %rep %0-2
        movu [ptrq + strideq*4], m %+ %1
        add strideq, widthq
        %rotate 1
    %endrep
    movu [ptrq + strideq*4], m %+ %1
%endmacro

%macro BATCH_INSERT64X4 4-*
    %assign %%imm8 %1
    %rotate 1
    %rep (%0-1)/3
        vinserti64x4 m%1, m%2, ym%3, %%imm8
        %rotate 3
    %endrep
%endmacro

%macro BATCH_EXTRACT_INSERT 2-*
    %assign %%imm8 %1
    %rotate 1
    %rep (%0-1)/2
        vextractf64x4 ym%1, m%1, %%imm8
        vextractf64x4 ym%2, m%2, %%imm8
        vinserti64x4  m%1, m%1, ym%2, %%imm8
        %rotate 2
    %endrep
%endmacro

%macro BATCH_MOVE 2-*
    %rep %0/2
        mova m%1, m%2
        %rotate 2
    %endrep
%endmacro

%macro BATCH_PERMUTE 3-*
    %xdefine %%decorator %1
    %xdefine %%mask %2
    %assign %%index %3
    %rotate 3
    %rep (%0-3)/2
        vperm %+ %%decorator m%1{%%mask}, m %+ %%index, m%2
        %rotate 2
    %endrep
%endmacro

; input : m3-m19
; output: m8 m5 m9 m15 m16 m7 m17 m27 m24 m21 m25 m19 m12 m23 m13 m11
%macro TRANSPOSE_16X16_AVX512 0
    BATCH_INSERT64X4 0x1, 20,4,12, 21,5,13, 22,6,14, 23,7,15
    BATCH_INSERT64X4 0x1, 24,8,16, 25,9,17, 26,10,18, 27,11,19

    BATCH_EXTRACT_INSERT 0x1, 4,12, 5,13, 6,14, 7,15
    BATCH_EXTRACT_INSERT 0x1, 8,16, 9,17, 10,18, 11,19

    BATCH_MOVE 12,20, 13,21, 14,22, 15,23
    BATCH_PERMUTE q, k6, 28, 12,24, 13,25, 14,26, 15,27
    BATCH_PERMUTE q, k5, 28, 24,20, 25,21, 26,22, 27,23

    BATCH_MOVE 16,4, 17,5, 18,6, 19,7
    BATCH_PERMUTE q, k6, 28, 16,8, 17,9, 18,10, 19,11
    BATCH_PERMUTE q, k5, 28, 8,4, 9,5, 10,6, 11,7

    BATCH_MOVE 4,12, 5,13, 6,24, 7,25
    BATCH_MOVE 20,16, 21,17, 22,8, 23,9
    BATCH_PERMUTE q, k4, 29, 4,14, 5,15, 6,26, 7,27
    BATCH_PERMUTE q, k3, 29, 14,12, 15,13, 26,24, 27,25
    BATCH_PERMUTE q, k4, 29, 20,18, 21,19, 22,10, 23,11
    BATCH_PERMUTE q, k3, 29, 18,16, 19,17, 10,8, 11,9

    BATCH_MOVE 8,4, 9,14, 16,6, 17,26
    BATCH_MOVE 24,20, 25,18, 12,22, 13,10
    BATCH_PERMUTE d, k2, 30, 8,5, 9,15, 16,7, 17,27
    BATCH_PERMUTE d, k1, 30, 5,4, 15,14, 7,6, 27,26
    BATCH_PERMUTE d, k2, 30, 24,21, 25,19, 12,23, 13,11
    BATCH_PERMUTE d, k1, 30, 21,20, 19,18, 23,22, 11,10
%endmacro

%macro INSERT_UNPACK 8
    vinsertf128 m%5, m%1, xm%3, 0x1
    vinsertf128 m%6, m%2, xm%4, 0x1
    vunpcklpd   m%7, m%5, m%6
    vunpckhpd   m%8, m%5, m%6
%endmacro

%macro SHUFFLE 4
    vshufps m%3, m%1, m%2, 0x88
    vshufps m%4, m%1, m%2, 0xDD
    mova    m%1, m%3
    mova    m%2, m%4
%endmacro

%macro EXTRACT_INSERT_UNPACK 6
    vextractf128 xm%1, m%1, 0x1
    vextractf128 xm%2, m%2, 0x1
    vinsertf128  m%3, m%3, xm%1, 0x0
    vinsertf128  m%4, m%4, xm%2, 0x0
    vunpcklpd    m%5, m%3, m%4
    vunpckhpd    m%6, m%3, m%4
%endmacro
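
; INSERT_UNPACK, SHUFFLE and EXTRACT_INSERT_UNPACK above are the building
; blocks of the AVX2 8x8 transpose below: 128-bit lane inserts/extracts
; followed by 64-bit (vunpcklpd/vunpckhpd) and 32-bit (vshufps) interleaves,
; keeping the whole transpose in registers.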
; Transpose 8x8 AVX2
; Limit the number of ym# registers to 16 for compatibility
; Use registers instead of stack memory
; Input:  m2-m9
; Output: m12, m14, m13, m15, m8, m10, m9, m11
%macro TRANSPOSE_8X8_AVX2 0
    INSERT_UNPACK 2, 3, 6, 7, 10, 11, 12, 13
    INSERT_UNPACK 4, 5, 8, 9, 10, 11, 14, 15

    SHUFFLE 12, 14, 10, 11
    SHUFFLE 13, 15, 10, 11

    EXTRACT_INSERT_UNPACK 4, 5, 8, 9, 10, 11
    EXTRACT_INSERT_UNPACK 2, 3, 6, 7, 8, 9

    SHUFFLE 8, 10, 6, 7
    SHUFFLE 9, 11, 6, 7
%endmacro

%macro TRANSPOSE 0
    %if cpuflag(avx512)
        TRANSPOSE_16X16_AVX512
    %elif cpuflag(avx2)
        TRANSPOSE_8X8_AVX2
    %endif
%endmacro

%macro WRITE_LOCAL_BUFFER 0
    %if cpuflag(avx512)
        WRITE_LOCAL_BUFFER_INTERNAL 8, 5, 9, 15, 16, 7, 17, 27, \
                                    24, 21, 25, 19, 12, 23, 13, 11
    %elif cpuflag(avx2)
        WRITE_LOCAL_BUFFER_INTERNAL 12, 14, 13, 15, 8, 10, 9, 11
    %endif
%endmacro

%macro SCATTERPS 0
    %if cpuflag(avx512)
        SCATTERPS_INTERNAL 8, 5, 9, 15, 16, 7, 17, 27, \
                           24, 21, 25, 19, 12, 23, 13, 11
    %elif cpuflag(avx2)
        SCATTERPS_INTERNAL 12, 14, 13, 15, 8, 10, 9, 11
    %endif
%endmacro

%macro OPTIMIZED_LOOP_STEP 0
    lea stepd, [stepsd - 1]
    cmp stepd, 0
    jle %%bscale_scalar
%%loop_step:
    sub localbufq, mmsize
    mulps m3, m1
    movu [localbufq], m3

    ; Filter leftwards
    lea xq, [widthq - 1]
%%loop_step_x_back:
    sub localbufq, mmsize
    FMULADD_PS m3, m3, m0, [localbufq], m3
    movu [localbufq], m3

    dec xq
    cmp xq, 0
    jg %%loop_step_x_back

    ; Filter rightwards
    mulps m3, m1
    movu [localbufq], m3
    add localbufq, mmsize

    lea xq, [widthq - 1]
%%loop_step_x:
    FMULADD_PS m3, m3, m0, [localbufq], m3
    movu [localbufq], m3
    add localbufq, mmsize

    dec xq
    cmp xq, 0
    jg %%loop_step_x

    dec stepd
    cmp stepd, 0
    jg %%loop_step
%%bscale_scalar:
%endmacro

;***************************************************************************
; void ff_horiz_slice(float *ptr, int width, int height, int steps,
;                     float nu, float bscale)
;***************************************************************************
%macro HORIZ_SLICE 0
%if UNIX64
%if cpuflag(avx512) || cpuflag(avx2)
cglobal horiz_slice, 5, 12, mmnum, 0-mmsize*4, buffer, width, height, steps, \
                                               localbuf, x, y, step, stride, remain, ptr, mask
%else
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
%endif
%else
%if cpuflag(avx512) || cpuflag(avx2)
cglobal horiz_slice, 5, 12, mmnum, 0-mmsize*4, buffer, width, height, steps, nu, bscale, \
                                               localbuf, x, y, step, stride, remain, ptr, mask
%else
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
%endif
%endif

%if cpuflag(avx512) || cpuflag(avx2)
    %assign rows mmsize/4
    %assign cols mmsize/4
%if WIN64
    VBROADCASTSS m0, num     ; nu
    VBROADCASTSS m1, bscalem ; bscale
    mov nuq, localbufm
    DEFINE_ARGS buffer, width, height, steps, \
                localbuf, x, y, step, stride, remain, ptr, mask
%else
    VBROADCASTSS m0, xmm0 ; nu
    VBROADCASTSS m1, xmm1 ; bscale
%endif
    MOVSXDIFNIDN width, height, steps

%if cpuflag(avx512)
    vpbroadcastd m2, widthd
    INIT_WORD_MASK k6, k5, k4, k3, k2, k1
    INIT_INDICES m28, m29, m30
%else
    movd xm2, widthd
    VBROADCASTSS m2, xm2
%endif

    vpmulld m2, m2, [gblur_vindex_width] ; vindex width

    xor yq, yq ; y = 0
    xor xq, xq ; x = 0

    cmp heightq, rows
    jl .y_scalar
    sub heightq, rows

.loop_y:
    ; ptr = buffer + y * width;
    mov ptrq, yq
    imul ptrq, widthq
    lea ptrq, [bufferq + ptrq*4]

    KXNOR m5, k7
    VGATHERDPS m3, [ptrq + m2*4], m5, k7
    mulps m3, m1
    movu [localbufq], m3
    add ptrq, 4
    add localbufq, mmsize

    ; Filter rightwards
    PUSH_MM m2
    lea xq, [widthq - 1]
.loop_x:
    PUSH_MM m3
    GATHERPS cols
    TRANSPOSE
    POP_MM m3
    WRITE_LOCAL_BUFFER

    add ptrq, mmsize
    add localbufq, rows * mmsize
    sub xq, cols
    cmp xq, cols
    jge .loop_x
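
    ; Fewer than cols columns are left in this row block: restore the gather
    ; indices and finish the row one column at a time below.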
    POP_MM m2

    cmp xq, 0
    jle .bscale_scalar
.loop_x_scalar:
    KXNOR m5, k7
    VGATHERDPS m4, [ptrq + m2*4], m5, k7
    FMULADD_PS m3, m3, m0, m4, m3
    movu [localbufq], m3

    add ptrq, 0x4
    add localbufq, mmsize
    dec xq
    cmp xq, 0
    jg .loop_x_scalar

.bscale_scalar:
    OPTIMIZED_LOOP_STEP
    sub ptrq, 4
    sub localbufq, mmsize
    mulps m3, m1
    KXNOR m5, k7, maskq
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq

    ; Filter leftwards
    PUSH_MM m2
    lea xq, [widthq - 1]
.loop_x_back:
    sub localbufq, rows * mmsize
    READ_LOCAL_BUFFER cols
    PUSH_MM m2
    TRANSPOSE
    POP_MM m3
    sub ptrq, mmsize
    SCATTERPS

    sub xq, cols
    cmp xq, cols
    jge .loop_x_back
    POP_MM m2

    cmp xq, 0
    jle .end_loop_x
.loop_x_back_scalar:
    sub ptrq, 0x4
    sub localbufq, mmsize
    FMULADD_PS m3, m3, m0, [localbufq], m3
    KXNOR m5, k7, maskq
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq

    dec xq
    cmp xq, 0
    jg .loop_x_back_scalar

.end_loop_x:
    add yq, rows
    cmp yq, heightq
    jle .loop_y

    add heightq, rows
    cmp yq, heightq
    jge .end_scalar

    mov remainq, widthq
    imul remainq, mmsize
    add ptrq, remainq

.y_scalar:
    mov remainq, heightq
    sub remainq, yq
    mov maskq, 1
    shlx maskq, maskq, remainq
    sub maskq, 1
    mov remainq, maskq
    PUSH_MASK m5, k1, remaind, xd, rsp + 0x20

    mov ptrq, yq
    imul ptrq, widthq
    lea ptrq, [bufferq + ptrq * 4] ; ptrq = buffer + y * width
    KMOVW m6, m5, k7, k1
    VGATHERDPS m3, [ptrq + m2 * 4], m6, k7
    mulps m3, m1 ; p0 *= bscale
    movu [localbufq], m3
    add localbufq, mmsize

    ; Filter rightwards
    lea xq, [widthq - 1]
.y_scalar_loop_x:
    add ptrq, 4
    KMOVW m6, m5, k7, k1
    VGATHERDPS m4, [ptrq + m2 * 4], m6, k7
    FMULADD_PS m3, m3, m0, m4, m3
    movu [localbufq], m3
    add localbufq, mmsize

    dec xq
    cmp xq, 0
    jg .y_scalar_loop_x

    OPTIMIZED_LOOP_STEP

    sub localbufq, mmsize
    mulps m3, m1 ; p0 *= bscale
    KMOVW k7, k1
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq, heightq

    ; Filter leftwards
    lea xq, [widthq - 1]
.y_scalar_loop_x_back:
    sub ptrq, 4
    sub localbufq, mmsize
    FMULADD_PS m3, m3, m0, [localbufq], m3
    KMOVW k7, k1
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq, heightq

    dec xq
    cmp xq, 0
    jg .y_scalar_loop_x_back

.end_scalar:
    RET
%else
%if WIN64
    movss m0, num
    movss m1, bscalem
    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
%endif
    movsxdifnidn widthq, widthd

    mulss m2, m0, m0 ; nu ^ 2
    mulss m3, m2, m0 ; nu ^ 3
    mulss m4, m3, m0 ; nu ^ 4
    xor xq, xq
    xor yd, yd
    mov strideq, widthq ; stride = width * 4
    shl strideq, 2

    ; w = w - ((w - 1) & 3)
    mov remainq, widthq
    sub remainq, 1
    and remainq, 3
    sub widthq, remainq

    shufps m0, m0, 0
    shufps m2, m2, 0
    shufps m3, m3, 0
    shufps m4, m4, 0

.loop_y:
    xor stepd, stepd

.loop_step:
    ; p0 *= bscale
    mulss m5, m1, [ptrq + xq * 4]
    movss [ptrq + xq * 4], m5
    inc xq

    ; filter rightwards
    ; Here we are vectorizing the C version by 4:
    ;     for (x = 1; x < width; x++)
    ;         ptr[x] += nu * ptr[x - 1];
    ; Let p0 stand for ptr[x-1], the value carried over from the previous
    ; iteration, and [p1,p2,p3,p4] be the vector data of this iteration.
    ; Unrolling the loop, we get:
    ;     p1' = p1 + p0*nu
    ;     p2' = p2 + p1*nu + p0*nu^2
    ;     p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
    ;     p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
    ; so we can do it in SIMD:
    ;     [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
    ;                         [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
    ;                         [0,0,0,p0]*nu^4
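    ; In the loop below, m0/m2/m3/m4 hold nu, nu^2, nu^3 and nu^4 broadcast to
    ; all lanes, and the low lane of m5 carries p0, refreshed from the last
    ; output element at the end of each iteration (shufps q3333).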
.loop_x:
    movu m6, [ptrq + xq * 4]      ; s = [p1,p2,p3,p4]
    pslldq m7, m6, 4              ;     [0, p1,p2,p3]
    movss m7, m5                  ;     [p0,p1,p2,p3]
    FMULADD_PS m6, m7, m0, m6, m8 ; s += [p0,p1,p2,p3] * nu
    pslldq m7, 4                  ;     [0,p0,p1,p2]
    FMULADD_PS m6, m7, m2, m6, m8 ; s += [0,p0,p1,p2] * nu^2
    pslldq m7, 4
    FMULADD_PS m6, m7, m3, m6, m8 ; s += [0,0,p0,p1] * nu^3
    pslldq m7, 4
    FMULADD_PS m6, m7, m4, m6, m8 ; s += [0,0,0,p0] * nu^4
    movu [ptrq + xq * 4], m6
    shufps m5, m6, m6, q3333
    add xq, 4
    cmp xq, widthq
    jl .loop_x

    add widthq, remainq
    cmp xq, widthq
    jge .end_scalar

.loop_scalar:
    ; ptr[x] += nu * ptr[x-1]
    movss m5, [ptrq + 4*xq - 4]
    mulss m5, m0
    addss m5, [ptrq + 4*xq]
    movss [ptrq + 4*xq], m5
    inc xq
    cmp xq, widthq
    jl .loop_scalar
.end_scalar:
    ; ptr[width - 1] *= bscale
    dec xq
    mulss m5, m1, [ptrq + 4*xq]
    movss [ptrq + 4*xq], m5
    shufps m5, m5, 0

    ; filter leftwards
    ;     for (; x > 0; x--)
    ;         ptr[x - 1] += nu * ptr[x];
    ; The idea here is basically the same as filtering rightwards, but we
    ; need to take care because the data layout is different.
    ; Let p0 stand for ptr[x], the value carried over from the previous
    ; iteration. In SIMD the update becomes:
    ;     [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
    ;                              + [p-3, p-2, p-1, p0] * nu
    ;                              + [p-2, p-1, p0, 0] * nu^2
    ;                              + [p-1, p0, 0, 0] * nu^3
    ;                              + [p0, 0, 0, 0] * nu^4
.loop_x_back:
    sub xq, 4
    movu m6, [ptrq + xq * 4]      ; s = [p-4, p-3, p-2, p-1]
    psrldq m7, m6, 4              ;     [p-3, p-2, p-1, 0 ]
    blendps m7, m5, 0x8           ;     [p-3, p-2, p-1, p0 ]
    FMULADD_PS m6, m7, m0, m6, m8 ; s += [p-3, p-2, p-1, p0 ] * nu
    psrldq m7, 4
    FMULADD_PS m6, m7, m2, m6, m8 ; s += [p-2, p-1, p0, 0] * nu^2
    psrldq m7, 4
    FMULADD_PS m6, m7, m3, m6, m8 ; s += [p-1, p0, 0, 0] * nu^3
    psrldq m7, 4
    FMULADD_PS m6, m7, m4, m6, m8 ; s += [p0, 0, 0, 0] * nu^4
    movu [ptrq + xq * 4], m6
    shufps m5, m6, m6, 0          ; m5 = [p-4', p-4', p-4', p-4']
    cmp xq, remainq
    jg .loop_x_back

    cmp xq, 0
    jle .end_scalar_back

.loop_scalar_back:
    ; ptr[x-1] += nu * ptr[x]
    movss m5, [ptrq + 4*xq]
    mulss m5, m0
    addss m5, [ptrq + 4*xq - 4]
    movss [ptrq + 4*xq - 4], m5
    dec xq
    cmp xq, 0
    jg .loop_scalar_back
.end_scalar_back:
    ; reset aligned width for next line
    sub widthq, remainq

    inc stepd
    cmp stepd, stepsd
    jl .loop_step

    add ptrq, strideq
    inc yd
    cmp yd, heightd
    jl .loop_y
    RET
%endif
%endmacro

%if ARCH_X86_64
INIT_XMM sse4
HORIZ_SLICE

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
%xdefine mmnum 16
HORIZ_SLICE
%endif

%if HAVE_AVX512_EXTERNAL
INIT_ZMM avx512
%xdefine mmnum 32
HORIZ_SLICE
%endif
%endif

%macro POSTSCALE_SLICE 0
cglobal postscale_slice, 2, 2, 4, ptr, length, postscale, min, max
    shl lengthd, 2
    add ptrq, lengthq
    neg lengthq
%if ARCH_X86_32
    VBROADCASTSS m0, postscalem
    VBROADCASTSS m1, minm
    VBROADCASTSS m2, maxm
%elif WIN64
    VBROADCASTSS m0, xmm2
    VBROADCASTSS m1, xmm3
    VBROADCASTSS m2, maxm
%else ; UNIX
    VBROADCASTSS m0, xmm0
    VBROADCASTSS m1, xmm1
    VBROADCASTSS m2, xmm2
%endif

.loop:
%if cpuflag(avx2) || cpuflag(avx512)
    mulps m3, m0, [ptrq + lengthq]
%else
    movu m3, [ptrq + lengthq]
    mulps m3, m0
%endif
    maxps m3, m1
    minps m3, m2
    movu [ptrq + lengthq], m3

    add lengthq, mmsize
    jl .loop

    RET
%endmacro

INIT_XMM sse
POSTSCALE_SLICE

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTSCALE_SLICE
%endif

%if HAVE_AVX512_EXTERNAL
INIT_ZMM avx512
POSTSCALE_SLICE
%endif
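
; Rough scalar equivalent of the postscale_slice kernels above (for
; orientation only; not taken from the C source):
;
;     for (i = 0; i < length; i++)
;         ptr[i] = FFMIN(FFMAX(ptr[i] * postscale, min), max);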
;*******************************************************************************
; void ff_verti_slice(float *buffer, int width, int height, int column_begin,
;                     int column_end, int steps, float nu, float bscale);
;*******************************************************************************
%macro VERTI_SLICE 0
%if UNIX64
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                           steps, x, y, cwidth, step, ptr, stride
%else
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                           steps, nu, bscale, x, y, cwidth, step, \
                                           ptr, stride
%endif
    %assign cols mmsize/4
%if WIN64
    VBROADCASTSS m0, num
    VBROADCASTSS m1, bscalem
    DEFINE_ARGS buffer, width, height, cbegin, cend, \
                steps, x, y, cwidth, step, ptr, stride
%else
    VBROADCASTSS m0, xmm0 ; nu
    VBROADCASTSS m1, xmm1 ; bscale
%endif
    MOVSXDIFNIDN width, height, cbegin, cend, steps

    mov cwidthq, cendq
    sub cwidthq, cbeginq
    lea strideq, [widthq * 4]

    xor xq, xq ; x = 0
    cmp cwidthq, cols
    jl .x_scalar
    cmp cwidthq, 0x0
    je .end_scalar

    sub cwidthq, cols
.loop_x:
    xor stepq, stepq
.loop_step:
    ; ptr = buffer + x + column_begin;
    lea ptrq, [xq + cbeginq]
    lea ptrq, [bufferq + ptrq*4]

    ; ptr[0..cols-1] *= bscale;
    movu m2, [ptrq]
    mulps m2, m1
    movu [ptrq], m2

    ; Filter downwards
    mov yq, 1
.loop_y_down:
    add ptrq, strideq ; ptrq += width
    movu m3, [ptrq]
    FMULADD_PS m2, m2, m0, m3, m2
    movu [ptrq], m2

    inc yq
    cmp yq, heightq
    jl .loop_y_down

    mulps m2, m1
    movu [ptrq], m2

    ; Filter upwards
    dec yq
.loop_y_up:
    sub ptrq, strideq
    movu m3, [ptrq]
    FMULADD_PS m2, m2, m0, m3, m2
    movu [ptrq], m2

    dec yq
    cmp yq, 0
    jg .loop_y_up

    inc stepq
    cmp stepq, stepsq
    jl .loop_step

    add xq, cols
    cmp xq, cwidthq
    jle .loop_x

    add cwidthq, cols
    cmp xq, cwidthq
    jge .end_scalar

.x_scalar:
    xor stepq, stepq
    mov qword [rsp + 0x10], xq
    sub cwidthq, xq
    mov xq, 1
    shlx cwidthq, xq, cwidthq
    sub cwidthq, 1
    PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20
    mov xq, qword [rsp + 0x10]

.loop_step_scalar:
    lea ptrq, [xq + cbeginq]
    lea ptrq, [bufferq + ptrq*4]

    VMASKMOVPS m2, [ptrq], m4, k1
    mulps m2, m1
    VMASKMOVPS [ptrq], m2, m4, k1

    ; Filter downwards
    mov yq, 1
.x_scalar_loop_y_down:
    add ptrq, strideq
    VMASKMOVPS m3, [ptrq], m4, k1
    FMULADD_PS m2, m2, m0, m3, m2
    VMASKMOVPS [ptrq], m2, m4, k1

    inc yq
    cmp yq, heightq
    jl .x_scalar_loop_y_down

    mulps m2, m1
    VMASKMOVPS [ptrq], m2, m4, k1

    ; Filter upwards
    dec yq
.x_scalar_loop_y_up:
    sub ptrq, strideq
    VMASKMOVPS m3, [ptrq], m4, k1
    FMULADD_PS m2, m2, m0, m3, m2
    VMASKMOVPS [ptrq], m2, m4, k1

    dec yq
    cmp yq, 0
    jg .x_scalar_loop_y_up

    inc stepq
    cmp stepq, stepsq
    jl .loop_step_scalar

.end_scalar:
    RET
%endmacro

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VERTI_SLICE
%endif

%if HAVE_AVX512_EXTERNAL
INIT_ZMM avx512
VERTI_SLICE
%endif
%endif
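
; Rough scalar equivalent of the VERTI_SLICE kernels above (for orientation
; only; not taken from the C source). Each SIMD pass handles mmsize/4 adjacent
; columns, so all loads and stores are contiguous and no transpose is needed:
;
;     for (x = column_begin; x < column_end; x++) {
;         for (step = 0; step < steps; step++) {
;             buffer[x] *= bscale;                            // top row
;             for (y = 1; y < height; y++)                    // downwards
;                 buffer[y*width + x] += nu * buffer[(y-1)*width + x];
;             buffer[(height-1)*width + x] *= bscale;         // bottom row
;             for (y = height - 1; y > 0; y--)                // upwards
;                 buffer[(y-1)*width + x] += nu * buffer[y*width + x];
;         }
;     }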