diff options
Diffstat (limited to 'libavcodec/x86/jpeg2000dsp.asm')
-rw-r--r-- | libavcodec/x86/jpeg2000dsp.asm | 108 |
1 files changed, 108 insertions, 0 deletions
; Recovered from: diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
; (new file mode 100644, index 0000000000..0d79ab7703)
;******************************************************************************
;* SIMD-optimized JPEG2000 DSP functions
;* Copyright (c) 2014 Nicolas Bertrand
;* Copyright (c) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Transform coefficients. Each one is replicated 8 times (32 bytes) so the
; same table can be loaded as a full vector by both the 16-byte (xmm) and
; the 32-byte (ymm) instantiation below.
pf_ict0: times 8 dd 1.402
pf_ict1: times 8 dd 0.34413
pf_ict2: times 8 dd 0.71414
pf_ict3: times 8 dd 1.772

SECTION .text

;***********************************************************************
; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
;
; In-place transform of three float planes, one vector (mmsize bytes)
; of each plane per loop iteration. With a = src0[i], b = src1[i] and
; c = src2[i] as loaded at the top of the iteration:
;
;     src0[i] = a + 1.402   * c
;     src1[i] = a - 0.34413 * b - 0.71414 * c
;     src2[i] = a + 1.772   * b
;
; NOTE(review): judging by the name this is the JPEG2000 inverse
; irreversible colour transform (Y/Cb/Cr -> R/G/B) - confirm with caller.
;
; Macro argument:
;   %1  number of vector registers handed to cglobal
;
; Preconditions implied by the code (none of them is checked here):
;   - every load/store is a movaps, so src0, src1 and src2 must be
;     aligned to mmsize bytes;
;   - the loop body runs before the counter is tested, so csize must
;     be > 0 (csize == 0 would still process one full vector);
;   - only whole vectors are processed: when csize is not a multiple
;     of mmsize/4 the last iteration reads and writes past element
;     csize-1, so the buffers need to be padded accordingly.
;
; Register use inside the loop:
;   csizeq  negative byte offset from the end of the planes, counts up to 0
;   m0-m2   current vectors of src0 / src1 / src2
;   m3      SSE: scratch copy of src1;  AVX: holds ICT3
;   m4, m5  partial results
;   m6, m7  ICT0, ICT1 (always in registers)
;   m8, m9  ICT2, ICT3 on x86-64 (ICT3 lives in m3 instead for AVX)
;***********************************************************************
%macro ICT_FLOAT 1
cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
    shl       csized, 2             ; element count -> byte count (4 bytes per float)
    add        src0q, csizeq        ; point all three planes at their end ...
    add        src1q, csizeq
    add        src2q, csizeq
    neg       csizeq                ; ... and index them with a negative offset
    movaps        m6, [pf_ict0]
    movaps        m7, [pf_ict1]
    %define ICT0 m6
    %define ICT1 m7

%if ARCH_X86_64
    ; Registers above m7 are available: keep every coefficient in a register.
    movaps        m8, [pf_ict2]
    %define ICT2 m8
%if cpuflag(avx)
    ; The 3-operand AVX path needs no scratch copy in m3, so reuse it.
    movaps        m3, [pf_ict3]
    %define ICT3 m3
%else
    movaps        m9, [pf_ict3]
    %define ICT3 m9
%endif

%else ; ARCH_X86_32
    ; Only m0-m7 exist: ICT2 is always used as a memory operand.
    %define ICT2 [pf_ict2]
%if cpuflag(avx)
    movaps        m3, [pf_ict3]
    %define ICT3 m3
%else
    ; m3 is the SSE scratch register, so ICT3 stays in memory as well.
    %define ICT3 [pf_ict3]
%endif

%endif ; ARCH

align 16
.loop:
    movaps        m0, [src0q+csizeq]    ; a
    movaps        m1, [src1q+csizeq]    ; b
    movaps        m2, [src2q+csizeq]    ; c

%if cpuflag(avx)
    mulps         m5, m1, ICT1          ; m5 = b * ICT1
    mulps         m4, m2, ICT0          ; m4 = c * ICT0
    mulps         m1, m1, ICT3          ; m1 = b * ICT3
    mulps         m2, m2, ICT2          ; m2 = c * ICT2
    subps         m5, m0, m5            ; m5 = a - b * ICT1
%else ; sse
    ; 2-operand forms overwrite their destination: work on copies first.
    movaps        m3, m1
    movaps        m4, m2
    movaps        m5, m0
    mulps         m3, ICT1              ; m3 = b * ICT1
    mulps         m4, ICT0              ; m4 = c * ICT0
    mulps         m1, ICT3              ; m1 = b * ICT3
    mulps         m2, ICT2              ; m2 = c * ICT2
    subps         m5, m3                ; m5 = a - b * ICT1
%endif
    addps         m4, m4, m0            ; m4 = a + c * ICT0
    addps         m0, m0, m1            ; m0 = a + b * ICT3
    subps         m5, m5, m2            ; m5 = a - b * ICT1 - c * ICT2

    movaps [src0q+csizeq], m4
    movaps [src2q+csizeq], m0
    movaps [src1q+csizeq], m5
    add       csizeq, mmsize
    jl .loop                            ; continue while the offset is still negative
    REP_RET
%endmacro

; Register counts match the highest register used on x86-64:
; m9 for the SSE version, m8 for the AVX version.
INIT_XMM sse
ICT_FLOAT 10
INIT_YMM avx
ICT_FLOAT 9