;*****************************************************************************
;* x86-optimized functions for showcqt filter
;*
;* Copyright (C) 2016 Muhammad Faiz
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc Coeffs
    .val:    pointer 1 ; coefficient values for this output bin
    .start:  resd 1    ; index of the first FFT bin the coefficients apply to
    .len:    resd 1    ; number of coefficients
    .sizeof:
endstruc

%macro CQT_CALC 9 ; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
                  ; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
    mov     id, xd
    add     id, [coeffsq + Coeffs.start + %9] ; i = coeffs[k].start + x
    movaps  m%5, [srcq + 8 * iq]              ; interleaved re/im FFT bins
    movaps  m%7, [srcq + 8 * iq + mmsize]
    shufps  m%6, m%5, m%7, q3131              ; m_im = imaginary parts
    shufps  m%5, m%5, m%7, q2020              ; m_re = real parts
    sub     id, fft_lend
    FMULADD_PS m%2, m%6, m%8, m%2, m%6        ; a_im += coeff * im
    neg     id                                ; i = fft_len - i (mirrored bin)
    FMULADD_PS m%1, m%5, m%8, m%1, m%5        ; a_re += coeff * re
    movups  m%5, [srcq + 8 * iq - mmsize + 8] ; mirrored bins, descending order
    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
%if mmsize == 32
    vperm2f128 m%5, m%5, m%5, 1               ; reverse the 128-bit lanes of the
    vperm2f128 m%7, m%7, m%7, 1               ; descending-order loads
%endif
    shufps  m%6, m%5, m%7, q1313
    shufps  m%5, m%5, m%7, q0202
    FMULADD_PS m%4, m%6, m%8, m%4, m%6        ; b_im += coeff * im
    FMULADD_PS m%3, m%5, m%8, m%3, m%5        ; b_re += coeff * re
%endmacro ; CQT_CALC

%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
    addps   m%5, m%4, m%2                     ; r_re = b_im + a_im
    subps   m%6, m%3, m%1                     ; r_im = b_re - a_re
    addps   m%1, m%1, m%3                     ; l_re = a_re + b_re
    subps   m%2, m%2, m%4                     ; l_im = a_im - b_im
    HADDPS  m%5, m%6, m%3                     ; horizontal sums of r_re/r_im
    HADDPS  m%1, m%2, m%3                     ; horizontal sums of l_re/l_im
    HADDPS  m%1, m%5, m%2                     ; m%1 = { l_re, l_im, r_re, r_im } per lane
%if mmsize == 32
    vextractf128 xmm%2, m%1, 1
    addps   xmm%1, xmm%2                      ; fold the upper lane into the lower
%endif
%endmacro ; CQT_SEPARATE

%macro DECLARE_CQT_CALC 0 ; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
%if ARCH_X86_64
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
align 16
.loop_k:                                      ; two output bins (k, k+1) per iteration
    mov     xd, [coeffsq + Coeffs.len]
    xorps   m0, m0, m0
    movaps  m1, m0
    movaps  m2, m0
    mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
    movaps  m3, m0
    movaps  m8, m0
    cmp     coeffs_lend, xd
    movaps  m9, m0
    movaps  m10, m0
    movaps  m11, m0
    cmova   coeffs_lend, xd                   ; coeffs_len = min(len[k], len[k+1])
    xor     xd, xd
    test    coeffs_lend, coeffs_lend
    jz      .check_loop_b
    mov     coeffs_valq, [coeffsq + Coeffs.val]
    mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
align 16
.loop_ab:                                     ; both bins still have coefficients
    movaps  m7, [coeffs_valq + 4 * xq]
    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
    movaps  m7, [coeffs_val2q + 4 * xq]
    CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
    add     xd, mmsize/4
    cmp     xd, coeffs_lend
    jb      .loop_ab
.check_loop_b:
    cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
    jae     .check_loop_a
align 16
.loop_b:                                      ; remaining coefficients of bin k+1
    movaps  m7, [coeffs_val2q + 4 * xq]
    CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
    add     xd, mmsize/4
    cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
    jb      .loop_b
.loop_end:
    CQT_SEPARATE 0, 1, 2, 3, 4, 5
    CQT_SEPARATE 8, 9, 10, 11, 4, 5
    mulps   xmm0, xmm0                        ; square the separated components
    mulps   xmm8, xmm8
    HADDPS  xmm0, xmm8, xmm1                  ; { |l|^2, |r|^2 } for bins k and k+1
    movaps  [dstq], xmm0
    sub     lend, 2
    lea     dstq, [dstq + 16]
    lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
    jnz     .loop_k
    RET
align 16
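; tail path taken when bin k+1 ran out of coefficients before bin k: finish
; accumulating the remaining coefficients of bin k alone, then rejoin the
; shared epilogue at .loop_end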
.check_loop_a:
    cmp     xd, [coeffsq + Coeffs.len]
    jae     .loop_end
align 16
.loop_a:                                      ; remaining coefficients of bin k
    movaps  m7, [coeffs_valq + 4 * xq]
    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
    add     xd, mmsize/4
    cmp     xd, [coeffsq + Coeffs.len]
    jb      .loop_a
    jmp     .loop_end
%else
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
%define fft_lend r4m
align 16
.loop_k:                                      ; one output bin per iteration
    mov     xd, [coeffsq + Coeffs.len]
    xorps   m0, m0, m0
    movaps  m1, m0
    movaps  m2, m0
    movaps  m3, m0
    test    xd, xd
    jz      .store
    mov     coeffs_valq, [coeffsq + Coeffs.val]
    xor     xd, xd
align 16
.loop_x:
    movaps  m7, [coeffs_valq + 4 * xq]
    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
    add     xd, mmsize/4
    cmp     xd, [coeffsq + Coeffs.len]
    jb      .loop_x
    CQT_SEPARATE 0, 1, 2, 3, 4, 5
    mulps   xmm0, xmm0
    HADDPS  xmm0, xmm0, xmm1
.store:
    movlps  [dstq], xmm0                      ; dst[k] = { |l|^2, |r|^2 }
    sub     lend, 1
    lea     dstq, [dstq + 8]
    lea     coeffsq, [coeffsq + Coeffs.sizeof]
    jnz     .loop_k
    RET
%endif ; ARCH_X86_64
%endmacro ; DECLARE_CQT_CALC

INIT_XMM sse
DECLARE_CQT_CALC
INIT_XMM sse3
DECLARE_CQT_CALC
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECLARE_CQT_CALC
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
DECLARE_CQT_CALC
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
DECLARE_CQT_CALC
%endif
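
; For reference, a scalar C sketch of what each showcqt_cqt_calc kernel above
; computes per output bin, paraphrased from the reference implementation in
; libavfilter/vf_showcqt.c. It is an illustration, not the exact source; the
; FFTComplex/FFTSample names and the Coeffs layout are assumed to match the
; C side of the filter:
;
;     typedef struct Coeffs { FFTSample *val; int start, len; } Coeffs;
;
;     static void cqt_calc(FFTComplex *dst, const FFTComplex *src,
;                          const Coeffs *coeffs, int len, int fft_len)
;     {
;         for (int k = 0; k < len; k++) {
;             FFTComplex a = {0,0}, b = {0,0}, l, r;
;             for (int x = 0; x < coeffs[k].len; x++) {
;                 FFTSample u = coeffs[k].val[x];
;                 int i = coeffs[k].start + x;   /* forward bin  */
;                 int m = fft_len - i;           /* mirrored bin */
;                 a.re += u * src[i].re;
;                 a.im += u * src[i].im;
;                 b.re += u * src[m].re;
;                 b.im += u * src[m].im;
;             }
;             /* separate the left and right channels */
;             l.re = a.re + b.re;
;             l.im = a.im - b.im;
;             r.re = b.im + a.im;
;             r.im = b.re - a.re;
;             dst[k].re = l.re * l.re + l.im * l.im;
;             dst[k].im = r.re * r.re + r.im * r.im;
;         }
;     }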