summary | refs | log | tree | commit | diff
path: root/libavcodec/x86/fft.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/x86/fft.asm')
-rw-r--r-- libavcodec/x86/fft.asm 155
1 files changed, 81 insertions, 74 deletions
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index 63e92f7159..53cfd64b3a 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -68,11 +68,12 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0
+cextern ps_neg
+
%assign i 16
-%rep 13
+%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep
@@ -321,6 +322,7 @@ IF%1 mova Z(1), m5
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
mova m0, Z(0)
@@ -410,6 +412,8 @@ fft32_interleave_avx:
jg .deint_loop
ret
+%endif
+
INIT_XMM sse
align 16
@@ -553,6 +557,7 @@ DEFINE_ARGS zc, w, n, o1, o3
INIT_YMM avx
+%if HAVE_AVX_EXTERNAL
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
@@ -563,6 +568,7 @@ cglobal fft_calc, 2,5,8
FFT_DISPATCH _interleave %+ SUFFIX, r1
REP_RET
+%endif
INIT_XMM sse
@@ -650,6 +656,68 @@ cglobal fft_permute, 2,7,1
jl .loopcopy
REP_RET
+%macro IMDCT_CALC_FUNC 0 ; emits one imdct_calc function for whatever INIT_* instruction set is active when the macro is invoked
+cglobal imdct_calc, 3,5,3 ; NOTE(review): presumably x86inc's "3 args, 5 GPRs, 3 vector regs" form; cglobal is defined outside this view
+ mov r3d, [r0 + FFTContext.mdctsize] ; r3 = mdctsize field of the context pointed to by r0
+ mov r4, [r0 + FFTContext.imdcthalf] ; r4 = imdcthalf function pointer from the same context
+ add r1, r3 ; offset the second argument by the raw mdctsize value before the call
+ PUSH r3 ; keep mdctsize across the call
+ PUSH r1 ; keep the offset pointer across the call
+%if ARCH_X86_32
+ push r2 ; 32-bit: pass the three arguments on the stack, last one first
+ push r1
+ push r0
+%else
+ sub rsp, 8+32*WIN64 ; allocate win64 shadow space
+%endif
+ call r4 ; indirect call through the pointer loaded above
+%if ARCH_X86_32
+ add esp, 12 ; drop the three 4-byte stack arguments
+%else
+ add rsp, 8+32*WIN64 ; release the space reserved before the call
+%endif
+ POP r1 ; r1 = offset pointer saved above
+ POP r3 ; r3 = mdctsize saved above
+ lea r0, [r1 + 2*r3] ; r0 = r1 + 2*mdctsize
+ mov r2, r3
+ sub r3, mmsize ; r3 = mdctsize - mmsize, walks downwards
+ neg r2 ; r2 = -mdctsize, walks upwards towards zero
+ mova m2, [ps_neg] ; NOTE(review): presumably a sign-bit mask; ps_neg is only declared cextern in this view - confirm
+.loop: ; each pass handles one vector from [r1 + r3] and one from [r0 + r2]
+%if mmsize == 8
+ PSWAPD m0, [r1 + r3] ; NOTE(review): PSWAPD is defined elsewhere; presumably swaps the two dwords - confirm
+ PSWAPD m1, [r0 + r2]
+ pxor m0, m2 ; xor m0 with the ps_neg constant
+%else
+ mova m0, [r1 + r3]
+ mova m1, [r0 + r2]
+ shufps m0, m0, 0x1b ; 0x1b selects elements 3,2,1,0: reverse the four dwords
+ shufps m1, m1, 0x1b
+ xorps m0, m2 ; xor m0 with the ps_neg constant
+%endif
+ mova [r0 + r3], m1 ; reversed vector read via r0 + r2 goes to r0 + r3
+ mova [r1 + r2], m0 ; reversed, xor-ed vector read via r1 + r3 goes to r1 + r2
+ sub r3, mmsize
+ add r2, mmsize
+ jl .loop ; repeat while r2 is still negative after the add
+%if cpuflag(3dnow)
+ femms ; 3dnow builds execute femms before returning
+ RET
+%else
+ REP_RET
+%endif
+%endmacro
+
+%if ARCH_X86_32 ; the two MMX-register variants below are only assembled when ARCH_X86_32 is set
+INIT_MMX 3dnow
+IMDCT_CALC_FUNC ; imdct_calc instance under the 3dnow settings
+INIT_MMX 3dnowext
+IMDCT_CALC_FUNC ; imdct_calc instance under the 3dnowext settings
+%endif
+
+INIT_XMM sse
+IMDCT_CALC_FUNC ; imdct_calc instance under the sse settings, outside the ARCH_X86_32 guard
+
%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
@@ -684,7 +752,7 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%endif
%assign n 1<<%1
-%rep 17-%1
+%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
@@ -709,9 +777,11 @@ align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
+%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
@@ -724,70 +794,6 @@ DECL_FFT 4
DECL_FFT 4, _interleave
%endif
-%if CONFIG_MDCT
-
-%macro IMDCT_CALC_FUNC 0
-cglobal imdct_calc, 3,5,3
- mov r3d, [r0 + FFTContext.mdctsize]
- mov r4, [r0 + FFTContext.imdcthalf]
- add r1, r3
- PUSH r3
- PUSH r1
-%if ARCH_X86_32
- push r2
- push r1
- push r0
-%else
- sub rsp, 8+32*WIN64 ; allocate win64 shadow space
-%endif
- call r4
-%if ARCH_X86_32
- add esp, 12
-%else
- add rsp, 8+32*WIN64
-%endif
- POP r1
- POP r3
- lea r0, [r1 + 2*r3]
- mov r2, r3
- sub r3, mmsize
- neg r2
- mova m2, [ps_m1m1m1m1]
-.loop:
-%if mmsize == 8
- PSWAPD m0, [r1 + r3]
- PSWAPD m1, [r0 + r2]
- pxor m0, m2
-%else
- mova m0, [r1 + r3]
- mova m1, [r0 + r2]
- shufps m0, m0, 0x1b
- shufps m1, m1, 0x1b
- xorps m0, m2
-%endif
- mova [r0 + r3], m1
- mova [r1 + r2], m0
- sub r3, mmsize
- add r2, mmsize
- jl .loop
-%if cpuflag(3dnow)
- femms
- RET
-%else
- REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
INIT_XMM sse
%undef mulps
%undef addps
@@ -985,7 +991,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
sub r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
- movd m7, [ps_m1m1m1m1]
+ movd m7, [ps_neg]
%endif
.pre:
%if ARCH_X86_64 == 0
@@ -1073,6 +1079,7 @@ DECL_IMDCT
%endif
INIT_YMM avx
-DECL_IMDCT
-%endif ; CONFIG_MDCT
+%if HAVE_AVX_EXTERNAL
+DECL_IMDCT
+%endif