diff options
author | Andras Becsi <andras.becsi@digia.com> | 2014-03-18 13:16:26 +0100 |
---|---|---|
committer | Frederik Gladhorn <frederik.gladhorn@digia.com> | 2014-03-20 15:55:39 +0100 |
commit | 3f0f86b0caed75241fa71c95a5d73bc0164348c5 (patch) | |
tree | 92b9fb00f2e9e90b0be2262093876d4f43b6cd13 /chromium/third_party/openmax_dl | |
parent | e90d7c4b152c56919d963987e2503f9909a666d2 (diff) | |
download | qtwebengine-chromium-3f0f86b0caed75241fa71c95a5d73bc0164348c5.tar.gz |
Update to new stable branch 1750
This also includes an updated ninja and chromium dependencies
needed on Windows.
Change-Id: Icd597d80ed3fa4425933c9f1334c3c2e31291c42
Reviewed-by: Zoltan Arvai <zarvai@inf.u-szeged.hu>
Reviewed-by: Zeno Albisser <zeno.albisser@digia.com>
Diffstat (limited to 'chromium/third_party/openmax_dl')
99 files changed, 7826 insertions, 276 deletions
diff --git a/chromium/third_party/openmax_dl/dl/api/armCOMM_s.h b/chromium/third_party/openmax_dl/dl/api/arm/armCOMM_s.h index 6b0d2be66a2..6ce1e2fc6a3 100644 --- a/chromium/third_party/openmax_dl/dl/api/armCOMM_s.h +++ b/chromium/third_party/openmax_dl/dl/api/arm/armCOMM_s.h @@ -371,6 +371,17 @@ .endm + @// Allocate 8-byte aligned area of name + @// |name| and size |size| bytes. + .macro M_ALLOC8 name, size + .if (_SBytes & 7) != 0 + .set _SBytes, _SBytes + (8 - (_SBytes & 7)) + .endif + .set \name\()_F, _SBytes + .set _SBytes, _SBytes + \size + + .endm + @ Load word from stack .macro M_LDR r, a0, a1, a2, a3 _M_DATA "ldr", 4, \r, \a0, \a1, \a2, \a3 @@ -381,6 +392,16 @@ _M_DATA "str", 4, \r, \a0, \a1, \a2, \a3 .endm + @ Load double word from stack + .macro M_LDRD r0, r1, a0, a1, a2, a3 + _M_DATA2 "ldrd", 8, \r0, \r1, \a0, \a1, \a2, \a3 + .endm + + @ Store double word to stack + .macro M_STRD r0, r1, a0, a1, a2, a3 + _M_DATA2 "strd", 8, \r0, \r1, \a0, \a1, \a2, \a3 + .endm + @ Macro to perform a data access operation @ Such as LDR or STR @ The addressing mode is modified such that @@ -407,3 +428,31 @@ .set _Offset, _Workspace + \a0\()_F \i\a1 \r, [sp, #_Offset] .endm + + @ Macro to perform a data access operation + @ Such as LDR or STR + @ The addressing mode is modified such that + @ 1. If no address is given then the name is taken + @ as a stack offset + @ 2. If the addressing mode is not available for the + @ state being assembled for (eg Thumb) then a suitable + @ addressing mode is substituted. + @ + @ On Entry: + @ $i = Instruction to perform (eg "LDRB") + @ $a = Required byte alignment + @ $r = Register(s) to transfer (eg "r1") + @ $a0,$a1,$a2. Addressing mode and condition. One of: + @ label {,cc} + @ [base] {,,,cc} + @ [base, offset]{!} {,,cc} + @ [base, offset, shift]{!} {,cc} + @ [base], offset {,,cc} + @ [base], offset, shift {,cc} + @ + @ WARNING: Most of the above are not supported, except the first case. 
+ .macro _M_DATA2 i, a, r0, r1, a0, a1, a2, a3 + .set _Offset, _Workspace + \a0\()_F + \i\a1 \r0, \r1, [sp, #_Offset] + .endm +
\ No newline at end of file diff --git a/chromium/third_party/openmax_dl/dl/api/armOMX.h b/chromium/third_party/openmax_dl/dl/api/arm/armOMX.h index 0ad21c42ce2..0ad21c42ce2 100644 --- a/chromium/third_party/openmax_dl/dl/api/armOMX.h +++ b/chromium/third_party/openmax_dl/dl/api/arm/armOMX.h diff --git a/chromium/third_party/openmax_dl/dl/api/omxtypes_s.h b/chromium/third_party/openmax_dl/dl/api/arm/omxtypes_s.h index d880d351fd5..d880d351fd5 100644 --- a/chromium/third_party/openmax_dl/dl/api/omxtypes_s.h +++ b/chromium/third_party/openmax_dl/dl/api/arm/omxtypes_s.h diff --git a/chromium/third_party/openmax_dl/dl/dl.gyp b/chromium/third_party/openmax_dl/dl/dl.gyp index 0573ce25631..61a05b007d9 100644 --- a/chromium/third_party/openmax_dl/dl/dl.gyp +++ b/chromium/third_party/openmax_dl/dl/dl.gyp @@ -18,79 +18,10 @@ 'include_dirs': [ '../', ], - 'cflags!': [ - '-mfpu=vfpv3-d16', - ], - 'cflags': [ - # We enable Neon instructions even with arm_neon==0, to support - # runtime detection. - '-mfpu=neon', - ], 'sources': [ - 'api/armCOMM_s.h', - 'api/armOMX.h', 'api/omxtypes.h', - 'api/omxtypes_s.h', - 'sp/api/armSP.h', 'sp/api/omxSP.h', - # Complex 32-bit fixed-point FFT. 
- 'sp/src/armSP_FFT_S32TwiddleTable.c', - 'sp/src/omxSP_FFTGetBufSize_C_SC32.c', - 'sp/src/omxSP_FFTInit_C_SC32.c', - 'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S', - 'sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S', - 'sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S', - # Real 32-bit fixed-point FFT - 'sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S', - 'sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S', - 'sp/src/omxSP_FFTGetBufSize_R_S32.c', - 'sp/src/omxSP_FFTInit_R_S32.c', - 'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S', - # Complex 16-bit fixed-point FFT - 'sp/src/omxSP_FFTInit_C_SC16.c', - 'sp/src/omxSP_FFTGetBufSize_C_SC16.c', - 'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S', - 'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S', - 'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S', - # Real 16-bit fixed-point FFT - 'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S', - 'sp/src/omxSP_FFTGetBufSize_R_S16S32.c', - 'sp/src/omxSP_FFTInit_R_S16S32.c', - 'sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S', - # Complex floating-point FFT - 'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S', - 
'sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S', - 'sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S', 'sp/src/armSP_FFT_F32TwiddleTable.c', - 'sp/src/omxSP_FFTGetBufSize_C_FC32.c', - 'sp/src/omxSP_FFTInit_C_FC32.c', - 'sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S', - 'sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S', - # Real floating-point FFT - 'sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S', - 'sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S', - 'sp/src/omxSP_FFTGetBufSize_R_F32.c', - 'sp/src/omxSP_FFTInit_R_F32.c', - 'sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S', ], 'conditions' : [ ['big_float_fft == 1', { @@ -98,6 +29,166 @@ 'BIG_FFT_TABLE', ], }], + ['target_arch=="arm"', { + 'cflags!': [ + '-mfpu=vfpv3-d16', + ], + 'cflags': [ + # We enable Neon instructions even with arm_neon==0, to support + # runtime detection. + '-mfpu=neon', + ], + 'dependencies': [ + '<(android_ndk_root)/android_tools_ndk.gyp:cpu_features', + 'openmax_dl_armv7', + ], + 'link_settings' : { + 'libraries': [ + # To get the __android_log_print routine + '-llog', + ], + }, + 'sources': [ + # Common files that are used by both the NEON and non-NEON code. + 'api/armCOMM_s.h', + 'api/armOMX.h', + 'api/omxtypes_s.h', + 'sp/api/armSP.h', + 'sp/src/arm/armSP_FFT_S32TwiddleTable.c', + 'sp/src/arm/detect.c', + 'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c', + 'sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c', + 'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c', + 'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c', + 'sp/src/arm/omxSP_FFTGetBufSize_R_S16.c', + 'sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c', + 'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c', + 'sp/src/arm/omxSP_FFTInit_C_FC32.c', + 'sp/src/arm/omxSP_FFTInit_C_SC16.c', + 'sp/src/arm/omxSP_FFTInit_C_SC32.c', + 'sp/src/arm/omxSP_FFTInit_R_F32.c', + 'sp/src/arm/omxSP_FFTInit_R_S16.c', + 'sp/src/arm/omxSP_FFTInit_R_S16S32.c', + 'sp/src/arm/omxSP_FFTInit_R_S32.c', + + # Complex 32-bit fixed-point FFT. 
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S', + 'sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S', + # Real 32-bit fixed-point FFT + 'sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S', + 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S', + # Complex 16-bit fixed-point FFT + 'sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S', + 'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S', + # Real 16-bit fixed-point FFT + 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S', + # Complex floating-point FFT + 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S', + 
'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S', + 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S', + 'sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S', + # Real floating-point FFT + 'sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S', + 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S', + 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S', + ], + }], + ['target_arch=="ia32" or target_arch=="x64"', { + 'cflags': [ + '-msse2', + ], + 'sources': [ + # Real 32-bit floating-point FFT. + 'sp/api/x86SP.h', + 'sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c', + 'sp/src/x86/omxSP_FFTGetBufSize_R_F32.c', + 'sp/src/x86/omxSP_FFTInit_R_F32.c', + 'sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c', + 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c', + 
'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c', + 'sp/src/x86/x86SP_FFT_F32_radix2_kernel.c', + 'sp/src/x86/x86SP_FFT_F32_radix4_kernel.c', + 'sp/src/x86/x86SP_SSE_Math.h', + ], + }], + ], + }, + ], + 'conditions': [ + ['target_arch=="arm"', { + 'targets': [ + { + # Non-NEON implementation of FFT. This library is NOT + # standalone. Applications must link with openmax_dl. + 'target_name': 'openmax_dl_armv7', + 'type': 'static_library', + 'include_dirs': [ + '../', + ], + 'cflags!': [ + '-mfpu=neon', + ], + 'sources': [ + # Complex floating-point FFT + 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S', + 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S', + 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S', + 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S', + 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S', + 'sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S', + 'sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S', + # Real floating-point FFT + 'sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S', + 'sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S', + 'sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S', + ], + }, ], - }] + }], + ], } diff --git a/chromium/third_party/openmax_dl/dl/sp/api/armSP.h b/chromium/third_party/openmax_dl/dl/sp/api/armSP.h index f615a87c7ab..4972f09c554 100644 --- a/chromium/third_party/openmax_dl/dl/sp/api/armSP.h +++ b/chromium/third_party/openmax_dl/dl/sp/api/armSP.h @@ -64,6 +64,14 @@ typedef struct ARMsFFTSpec_R_SC32_Tag OMX_S32 *pBuf; }ARMsFFTSpec_R_SC32; +typedef struct ARMsFFTSpec_R_SC16_Tag +{ + OMX_U32 N; + OMX_U16 *pBitRev; + OMX_SC16 *pTwiddle; + OMX_S16 *pBuf; +} ARMsFFTSpec_R_SC16; + typedef struct ARMsFFTSpec_R_FC32_Tag { OMX_U32 N; diff --git a/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h b/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h index 3016c772f73..5a7980ad452 100644 --- a/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h +++ 
b/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h @@ -44,6 +44,7 @@ extern "C" { typedef void OMXFFTSpec_C_SC16; typedef void OMXFFTSpec_C_SC32; typedef void OMXFFTSpec_R_S16S32; + typedef void OMXFFTSpec_R_S16; typedef void OMXFFTSpec_R_S32; typedef void OMXFFTSpec_R_F32; typedef void OMXFFTSpec_C_FC32; @@ -1423,7 +1424,7 @@ OMXResult omxSP_FFTInit_C_SC32 ( * Input Arguments: * * order - base-2 logarithm of the desired block length; valid in the range - * [0,12] + * [1,15] * * Output Arguments: * @@ -1436,7 +1437,7 @@ OMXResult omxSP_FFTInit_C_SC32 ( * following is true: * - pFFTSpec is either NULL or violates the 8-byte alignment * restrictions - * - order < 0 or order > 12 + * - order < 1 or order > 15 * */ OMXResult omxSP_FFTInit_C_FC32( @@ -1487,6 +1488,45 @@ OMXResult omxSP_FFTInit_R_S16S32( /** + * Function: omxSP_FFTInit_R_S16 + * + * Description: + * These functions initialize specification structures required for the real + * FFT and IFFT functions. The function <FFTInit_R_S16> is used + * to initialize the specification structures for functions + * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>. + * + * Memory for *pFFTFwdSpec must be allocated before calling these functions + * and should be 8-byte aligned. + * + * The number of bytes required for *pFFTFwdSpec can be + * determined using <FFTGetBufSize_R_S16>. 
+ * + * Input Arguments: + * + * order - base-2 logarithm of the desired block length; valid in the range + * [1,12] + * + * Output Arguments: + * + * pFFTFwdSpec - pointer to the initialized specification structure + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if one or more of the + * following is true: + * - pFFTFwdSpec is either NULL or violates the 8-byte alignment + * restrictions + * - order < 1 or order > 12 + * + */ +OMXResult omxSP_FFTInit_R_S16 ( + OMXFFTSpec_R_S32*pFFTFwdSpec, + OMX_INT order +); + +/** * Function: omxSP_FFTInit_R_S32 (2.2.4.1.4) * * Description: @@ -1543,7 +1583,7 @@ OMXResult omxSP_FFTInit_R_S32 ( * Input Arguments: * * order - base-2 logarithm of the desired block length; valid in the range - * [0,12] + * [1,15] * * Output Arguments: * @@ -1556,7 +1596,7 @@ OMXResult omxSP_FFTInit_R_S32 ( * following is true: * - pFFTFwdSpec is either NULL or violates the 8-byte alignment * restrictions - * - order < 0 or order > 12 + * - order < 1 or order > 15 * */ OMXResult omxSP_FFTInit_R_F32( @@ -1644,7 +1684,7 @@ OMXResult omxSP_FFTGetBufSize_C_SC32 ( * Input Arguments: * * order - base-2 logarithm of the desired block length; valid in the range - * [0,12] + * [1,15] * * Output Arguments: * @@ -1657,7 +1697,7 @@ OMXResult omxSP_FFTGetBufSize_C_SC32 ( * OMX_Sts_BadArgErr - bad arguments; returned if one or more of the * following is true: * - pSize is NULL - * - order < 0 or order > 12 + * - order < 1 or order > 15 * */ OMXResult omxSP_FFTGetBufSize_C_FC32( @@ -1699,6 +1739,38 @@ OMXResult omxSP_FFTGetBufSize_R_S16S32( ); +/** + * Function: omxSP_FFTGetBufSize_R_S16 + * + * Description: + * These functions compute the size of the specification structure + * required for the length 2^order real FFT and IFFT functions. The function + * <FFTGetBufSize_R_S16> is used in conjunction with the 16-bit + * functions <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>. 
+ * + * Input Arguments: + * + * order - base-2 logarithm of the length; valid in the range + * [1,12] + * + * Output Arguments: + * + * pSize - pointer to the number of bytes required for the specification + * structure + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments The function returns + * OMX_Sts_BadArgErr if one or more of the following is true: + * pSize is NULL + * order < 1 or order > 12 + * + */ +OMXResult omxSP_FFTGetBufSize_R_S16 ( + OMX_INT order, + OMX_INT *pSize +); /** * Function: omxSP_FFTGetBufSize_R_S32 (2.2.4.1.8) @@ -1743,7 +1815,7 @@ OMXResult omxSP_FFTGetBufSize_R_S32 ( * * Input Arguments: * - * order - base-2 logarithm of the length; valid in the range [0,12] + * order - base-2 logarithm of the length; valid in the range [1,15] * * Output Arguments: * @@ -1756,7 +1828,7 @@ OMXResult omxSP_FFTGetBufSize_R_S32 ( * OMX_Sts_BadArgErr - bad arguments The function returns * OMX_Sts_BadArgErr if one or more of the following is true: * pSize is NULL - * order < 0 or order > 12 + * order < 1 or order > 15 * */ OMXResult omxSP_FFTGetBufSize_R_F32( @@ -1886,8 +1958,7 @@ OMXResult omxSP_FFTFwd_CToC_SC32_Sfs ( * must be aligned on a 32-byte boundary. * pFFTSpec - pointer to the preallocated and initialized specification * structure - * scaleFactor - scale factor of the output. Valid value is 0 - * only. + * scaleFactor - scale factor of the output. Valid range is [0,16]. * * Output Arguments: * order @@ -2024,6 +2095,59 @@ OMXResult omxSP_FFTFwd_RToCCS_S16S32_Sfs ( ); +/** + * Function: omxSP_FFTFwd_RToCCS_S16_Sfs + * + * Description: + * These functions compute an FFT for a real-valued signal of length of 2^order, + * where 0 < order <= 12. Transform length is determined by the + * specification structure, which must be initialized prior to calling the FFT + * function using the appropriate helper, i.e., <FFTInit_R_S16>. 
+ * The relationship between the input and output sequences can + * be expressed in terms of the DFT, i.e.: + * + * x[n] = (2^(-scalefactor)/N) . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N) + * n=0,1,2,...N-1 + * N=2^order. + * + * The conjugate-symmetric output sequence is represented using a CCS vector, + * which is of length N+2, and is organized as follows: + * + * Index: 0 1 2 3 4 5 . . . N-2 N-1 N N+1 + * Component: R0 0 R1 I1 R2 I2 . . . R[N/2-1] I[N/2-1] R[N/2] 0 + * + * where R[n] and I[n], respectively, denote the real and imaginary components + * for FFT bin 'n'. Bins are numbered from 0 to N/2, where N is the FFT length. + * Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to + * the foldover frequency. + * + * Input Arguments: + * pSrc - pointer to the real-valued input sequence, of length 2^order; + * must be aligned on a 32-byte boundary. + * pFFTSpec - pointer to the preallocated and initialized specification + * structure + * scaleFactor - output scale factor; valid range is [0, 16] + * + * Output Arguments: + * pDst - pointer to output sequence, represented using CCS format, of + * length (2^order)+2; must be aligned on a 32-byte boundary. 
+ * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments, if one or more of followings is true: + * - one of the pointers pSrc, pDst, or pFFTSpec is NULL + * - pSrc or pDst is not aligned on a 32-byte boundary + * - scaleFactor<0 or scaleFactor >16 + * + */ +OMXResult omxSP_FFTFwd_RToCCS_S16_Sfs ( + const OMX_S16* pSrc, + OMX_S16* pDst, + const OMXFFTSpec_R_S16* pFFTSpec, + OMX_INT scaleFactor +); + /** * Function: omxSP_FFTFwd_RToCCS_S32_Sfs (2.2.4.4.2) @@ -2129,7 +2253,29 @@ OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs( const OMXFFTSpec_R_F32* pFFTSpec ); +#ifdef __arm__ +/* + * Non-NEON version of omxSP_FFTFwd_RToCCS_F32_Sfs + */ +OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs_vfp( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec +); +/* + * Just like omxSP_FFTFwd_RToCCS_F32_Sfs, but automatically detects + * whether NEON is available or not and chooses the appropriate + * routine. + */ +extern OMXResult (*omxSP_FFTFwd_RToCCS_F32)( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec +); +#else +#define omxSP_FFTFwd_RToCCS_F32 omxSP_FFTFwd_RToCCS_F32_Sfs +#endif /** * Function: omxSP_FFTInv_CCSToR_S32S16_Sfs (2.2.4.4.4) @@ -2179,6 +2325,53 @@ OMXResult omxSP_FFTInv_CCSToR_S32S16_Sfs ( ); +/** + * Function: omxSP_FFTInv_CCSToR_S16_Sfs + * + * Description: + * These functions compute the inverse FFT for a conjugate-symmetric input + * sequence. Transform length is determined by the specification structure, + * which must be initialized prior to calling the FFT function using + * <FFTInit_R_S16>. For a transform of length M, the input + * sequence is represented using a packed CCS vector of length + * M+2, and is organized as follows: + * + * Index: 0 1 2 3 4 5 . . . M-2 M-1 M M+1 + * Component R[0] 0 R[1] I[1] R[2] I[2] . . . R[M/2-1] I[M/2-1] R[M/2] 0 + * + * where R[n] and I[n], respectively, denote the real and imaginary components + * for FFT bin n. 
+ * Bins are numbered from 0 to M/2, where M is the FFT length. Bin index 0 + * corresponds to the DC component, and bin index M/2 corresponds to the + * foldover frequency. + * + * Input Arguments: + * pSrc - pointer to the complex-valued input sequence represented using + * CCS format, of length (2^order) + 2; must be aligned on a 32-byte + * boundary. + * pFFTSpec - pointer to the preallocated and initialized specification + * structure + * scaleFactor - output scalefactor; range is [0,16] + * + * Output Arguments: + * pDst - pointer to the real-valued output sequence, of length 2^order ; must + * be aligned on a 32-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments if one or more of the following is true: + * - pSrc, pDst, or pFFTSpec is NULL + * - pSrc or pDst is not aligned on a 32-byte boundary + * - scaleFactor<0 or scaleFactor >16 + * + */ +OMXResult omxSP_FFTInv_CCSToR_S16_Sfs ( + const OMX_S16* pSrc, + OMX_S16* pDst, + const OMXFFTSpec_R_S16* pFFTSpec, + OMX_INT scaleFactor +); /** * Function: omxSP_FFTInv_CCSToR_S32_Sfs (2.2.4.4.4) @@ -2274,7 +2467,28 @@ OMXResult omxSP_FFTInv_CCSToR_F32_Sfs( const OMXFFTSpec_R_F32* pFFTSpec ); +#ifdef __arm__ +/* + * Non-NEON version of omxSP_FFTInv_CCSToR_F32_Sfs + */ +OMXResult omxSP_FFTInv_CCSToR_F32_Sfs_vfp( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec +); +/* + * Just like omxSP_FFTInv_CCSToR_F32_Sfs, but automatically detects + * whether NEON is available or not and chooses the appropriate + * routine. 
+ */ +extern OMXResult (*omxSP_FFTInv_CCSToR_F32)( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec); +#else +#define omxSP_FFTInv_CCSToR_F32 omxSP_FFTInv_CCSToR_F32_Sfs +#endif #ifdef __cplusplus } diff --git a/chromium/third_party/openmax_dl/dl/sp/api/x86SP.h b/chromium/third_party/openmax_dl/dl/sp/api/x86SP.h new file mode 100644 index 00000000000..53127343b75 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/api/x86SP.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2007-2008 ARM Limited. All Rights Reserved. + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * It has been relicensed with permission from the copyright holders. + */ + +#ifndef _x86SP_H_ +#define _x86SP_H_ + +#include "dl/api/omxtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern OMX_F32 armSP_FFT_F32TwiddleTable[]; + +typedef struct X86FFTSpec_R_FC32_Tag +{ + OMX_U32 N; + OMX_F32* pTwiddle; + // Ping Pong buffer for doing the N/2 point complex FFT. 
+ OMX_F32* pBuf1; + OMX_F32* pBuf2; + +} X86FFTSpec_R_FC32; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/armSP_FFT_S32TwiddleTable.c index a0db0575b50..a0db0575b50 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armSP_FFT_S32TwiddleTable.c diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S new file mode 100644 index 00000000000..75d6711cd64 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S @@ -0,0 +1,260 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of +@// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S to support float +@// instead of SC32. +@// + +@// +@// Description: +@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT +@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation +@// It implements the "scaled"(by 1/2) version of the above formula. 
+@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@/ IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 + + +@// Output registers +#define result r0 + +@//Local Scratch Registers + + +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define pTwiddle r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +#define count r8 +#define diffMinusOne r2 +#define round r3 + +#define pOut1 r2 +#define size r7 +#define step r3 +#define step1 r6 +#define twStep r12 +#define pTwiddleTmp r14 +#define t0 r12 + +#define x0r s0 +#define x0i s1 +#define x1r s2 +#define x1i s3 +#define w0r s4 +#define w0i s5 +#define y0r s6 +#define y0i s7 +#define w1r s6 +#define w1i s7 +#define y1r s6 /*@// w1r,w1i*/ +#define y1i s7 +#define st0 s8 +#define st1 s9 +#define st2 s10 +#define st3 s11 +#define st4 s12 +#define st5 s13 +//@ half = 0.5 +#define half s15 + + + + + + .MACRO FFTSTAGE scaled, inverse,name + + @// Initialize half now. 
+ movw N, #0x0000 + movt N, #0x3f00 + vmov.f32 half, N @// half = 0.5 + + @// Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @// Read other structure parameters + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + + MOV size,N,ASR #1 @// preserve the contents of N + + MOV step,size,LSL #3 @// step = N/2 * 8 bytes + ADD pTwiddleTmp,pTwiddle,#8 @// W^2 + + ADD pOut1,pOut,step @// pOut1 = pOut+ N/2*8 bytes + @// twStep = 3N/8 * 8 bytes pointing to W^1 + SUB twStep,step,size,LSL #1 + MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes + SUB step1,step1,#8 @// (N/4-1)*8 bytes + ADD argTwiddle,pTwiddle,twStep @// W^1 + + @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]} + @// Note: W^(k) is stored as negated value and also need to + @// conjugate the values from the table + + @// Z(0) : no need of twiddle multiply + @// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] } + + + add pSrc, step @// step = N/2*8 bytes + vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step] + sub pSrc, step + vldm.f32 pSrc!, {x0r, x0i} + + SUBS size,size,#2 + + vadd.f32 st0, x0r, x1r @// a+c + vsub.f32 st1, x0r, x1r @// a-c + vmov.f32 x0r, st0 + vmov.f32 x1r, st1 + vsub.f32 st0, x0i, x1i @// b-d + vadd.f32 x1i, x0i, x1i @// b+d + vmov.f32 x0i, st0 + + + vsub.f32 x0r,x0r,x1i @// Z(0).r + vadd.f32 x0i,x0i,x1r @// Z(0).i + + vmul.f32 x0r, half + vmul.f32 x0i, half + vstm.f32 pOut1!, {x0r, x0i} @// pOut1 = pOut+ N/2*8 bytes + + BLT end\name + BEQ lastElement\name + + ASR size,size,#1 +evenOddButterflyLoop\name: + + SUB step,step,#16 @// (N/2-2)*8 bytes + + add pSrc, step @// (N/2-1)*8 bytes + vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step] + sub pSrc, step + vldm.f32 pSrc!, {x0r, x0i} + add argTwiddle, step1 + vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step] + sub argTwiddle, step1 + vldm.f32 argTwiddle!, {w0r, w0i} + + SUB step1,step1,#8 + SUBS size,size,#1 + + + vsub.f32 
st2,x0r,x1r @// a-c + vadd.f32 st3,x0i,x1i @// b+d + vadd.f32 st0,x0r,x1r @// a+c + vsub.f32 st1,x0i,x1i @// b-d + + vmul.f32 x1r,w1r,st2 + vmul.f32 x1i,w1r,st3 + vmls.f32 x1r,w1i,st3 + vmla.f32 x1i,w1i,st2 + + vadd.f32 y1r,st0,x1i @// F(N/2 -1) + vsub.f32 y1i,x1r,st1 @// y1r,y1i same as w1r, w1i + + + vmul.f32 x0r,w0r,st2 + vmul.f32 x0i,w0r,st3 + vmla.f32 x0r,w0i,st3 + vmls.f32 x0i,w0i,st2 + + + vadd.f32 st4,st0,x0i @// F(1) + vsub.f32 st5,st1,x0r + + + vmul.f32 y1r, half + vmul.f32 y1i, half + vmul.f32 st4, half + vmul.f32 st5, half + add pOut1, step @// (N/2-1)*8 bytes + vstm.f32 pOut1, {y1r, y1i} @// {y1r,y1i} = [pOut1, step] + sub pOut1, step + vstm.f32 pOut1!, {st4, st5} + + MOV t0,argTwiddle @// swap ptr for even and odd twiddles + MOV argTwiddle,pTwiddleTmp + MOV pTwiddleTmp,t0 + + BGT evenOddButterflyLoop\name + + + @// Last element can be expanded as follows + @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] + @// (since W^k is stored as -ve) + @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)] + @// 1/2[2a+j0] + j (c-jd) [0+j2b] + @// (a+bc, -bd) + @// Since (c,d) = (0,1) for the last element, result is just (a,-b) + +lastElement\name: + vldm.f32 pSrc, {x0r, x0i} + + vneg.f32 x0i, x0i + vstm.f32 pOut1, {x0r, x0i} +end\name: + + + .endm + + +@ Structure offsets for FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + + M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp,r4 + FFTSTAGE "FALSE","TRUE",Inv + M_END + +@// ENDIF @//ARM1136JS + + + @// Guarding implementation by the processor name + + + + .end diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S new file mode 100644 index 00000000000..c2feb0bc758 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S @@ -0,0 +1,145 
@@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S +@// to support float instead of SC32. +@// + +@// +@// Description: +@// Compute a Radix 2 FFT stage for a N point complex signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + + + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r2 +#define pTwiddle r1 +#define pPingPongBuf r5 +#define subFFTNum r6 +#define subFFTSize r7 + + +@//Output Registers + + +@//Local Scratch Registers + +#define pDstBuf r3 /*@// Temporarily hold pingpong buffer ptr*/ +#define grpSize r14 +#define outPointStep r12 +#define setCount r14 +#define pointStep r12 + +@// Real and Imaginary parts +#define x0r s0 +#define x0i s1 +#define x1r s2 +#define x1i s3 +#define y1r s4 +#define y1i s5 +#define y0r s6 +#define y0i s7 + + + + .MACRO FFTSTAGE scaled, inverse, name + + @// Update grpCount and grpSize rightaway inorder to reuse pGrpCount + @// and pGrpSize regs + + mov subFFTSize, #2 + lsr grpSize, subFFTNum, #1 + mov subFFTNum, grpSize + + @// pT0+1 increments pT0 by 8 bytes + @// pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes + @// Note: outPointStep = pointStep for firststage + @// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount) + MOV pointStep,grpSize,LSL #3 + + 
+ + @// Loop on the sets for grp zero + +grpZeroSetLoop\name: + + add pSrc, pSrc, pointStep + @// {x1r,x1i} = [pSrc, pointStep] + vldm.f32 pSrc, {x1r, x1i} + sub pSrc, pSrc, pointStep + vldm.f32 pSrc!, {x0r, x0i} + + SUBS setCount,setCount,#1 @// decrement the loop counter + + + + vsub.f32 y1r,x0r,x1r + vsub.f32 y1i,x0i,x1i + + vadd.f32 y0r,x0r,x1r + vadd.f32 y0i,x0i,x1i + + add pDst, pDst, outPointStep + @// {y1r,y1i} -> [pDst, outPointStep] + vstm pDst, {y1r, y1i} + sub pDst, pDst, outPointStep + vstm pDst!, {y0r, y0i} + + BGT grpZeroSetLoop\name + + + @// reset pSrc to pDst for the next stage + SUB pSrc,pDst,pointStep @// pDst -= 2*grpSize + mov pDst, pPingPongBuf + + .endm + + + M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","TRUE",INV + M_END + + +@// ENDIF @//ARM1136JS + + +@// Guarding implementation by the processor name + + + + .end diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S new file mode 100644 index 00000000000..3bd47252f1e --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S @@ -0,0 +1,213 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S +@// to support float instead of SC32.
+@// + +@// +@// Description: +@// Compute a first stage Radix 4 FFT stage for a N point complex signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + + + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r2 +#define pTwiddle r1 +#define pPingPongBuf r5 +#define subFFTNum r6 +#define subFFTSize r7 + + +@//Output Registers + + +@//Local Scratch Registers + +#define grpSize r14 +#define outPointStep r12 +#define setStep r3 +#define setCount r14 /*@// Reuse grpSize as setCount*/ +#define pointStep r12 + +@// Real and Imaginary parts +#define x0r s0 +#define x0i s1 +#define x1r s2 +#define x1i s3 +#define x2r s4 +#define x2i s5 +#define x3r s6 +#define x3i s7 +#define t3r s0 /*@// Temporarily hold x3r and x3i*/ +#define t3i s1 +#define sr s8 +#define si s9 + + + + .MACRO FFTSTAGE scaled, inverse, name + + @// Define stack arguments + + + @// Update grpCount and grpSize rightaway inorder to reuse + @// pSubFFTSize and pSubFFTNum regs + mov subFFTSize, #4 + lsr grpSize, subFFTNum, #2 + mov subFFTNum, grpSize + + + @// pT0+1 increments pT0 by 8 bytes + @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes + @// Note: outPointStep = pointStep for firststage + @// Note: setCount = grpSize/4 (reuse the updated grpSize for setCount) + MOV pointStep,grpSize,LSL #3 + + + @// Calculate the step of input data for the next set + @//MOV setStep,pointStep,LSL #1 + MOV setStep,grpSize,LSL #4 + @// setStep = 3*pointStep + ADD setStep,setStep,pointStep + @// setStep = - 3*pointStep+8 + RSB setStep,setStep,#8 + + @// grp = 0 a special case since all the twiddle factors are 1 + @// Loop on the sets + +grpZeroSetLoop\name: + + vldm.f32 pSrc, {x0r, x0i} + add pSrc, pSrc, 
pointStep + vldm.f32 pSrc, {x1r, x1i} + add pSrc, pSrc, pointStep + vldm.f32 pSrc, {x2r, x2i} + add pSrc, pSrc, pointStep + vldm.f32 pSrc, {x3r, x3i} + add pSrc, pSrc, setStep + + + @// Decrement setcount + SUBS setCount,setCount,#1 + + + + @// finish first stage of 4 point FFT + + vadd.f32 x0r,x0r,x2r @// x0 = x0 + x2 + vadd.f32 x0i,x0i,x2i + + vadd.f32 sr, x2r, x2r + vadd.f32 si, x2i, x2i + vsub.f32 x2r,x0r,sr @// x2 = x0 - x2 + vsub.f32 x2i,x0i,si + + vadd.f32 x1r,x1r,x3r @// x1 = x1 + x3 + vadd.f32 x1i,x1i,x3i + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vsub.f32 x3r,x1r,sr @// x3 = x1 - x3 + vsub.f32 x3i,x1i,si + + + @// finish second stage of 4 point FFT + + + vadd.f32 x0r,x0r,x1r @// x0 = x0 + x1 + vadd.f32 x0i,x0i,x1i + + vadd.f32 sr, x1r, x1r + vadd.f32 si, x1i, x1i + vsub.f32 x1r,x0r,sr @// x1 = x0 - x1 + vsub.f32 x1i,x0i,si + + vstm.f32 pDst, {x0r, x0i} + add pDst, pDst, outPointStep + + vadd.f32 x2r,x2r,x3i + vsub.f32 x2i,x2i,x3r + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vsub.f32 t3r, x2r, si + vadd.f32 t3i, x2i, sr + + .ifeqs "\inverse", "TRUE" + vstm.f32 pDst, {t3r, t3i} + add pDst, pDst, outPointStep + vstm.f32 pDst, {x1r, x1i} + add pDst, pDst, outPointStep + vstm.f32 pDst, {x2r, x2i} + add pDst, pDst, setStep + .else + vstm.f32 pDst, {x2r, x2i} + add pDst, pDst, outPointStep + vstm.f32 pDst, {x1r, x1i} + add pDst, pDst, outPointStep + vstm.f32 pDst, {t3r, t3i} + add pDst, pDst, setStep + .endif + + + BGT grpZeroSetLoop\name + + + @// reset pSrc to pDst for the next stage + SUB pSrc,pDst,pointStep @// pDst -= 2*grpSize + mov pDst, pPingPongBuf + + .endm + + + M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + + M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","TRUE",INV + M_END + + +@// ENDIF @//ARM1136JS + + +@// Guarding implementation by the processor name + + + + + .end diff --git 
a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S new file mode 100644 index 00000000000..00e48d1e6ea --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S @@ -0,0 +1,310 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.S +@// to support float instead of SC32. +@// + +@// +@// Description: +@// Compute a Radix 4 FFT stage for a N point complex signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + + + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r2 +#define pTwiddle r1 +#define subFFTNum r6 +#define subFFTSize r7 + + + +@//Output Registers + + +@//Local Scratch Registers + +#define grpCount r12 +#define step r12 /*@// Reuse grpCount*/ +#define outPointStep r3 +#define setCount r8 +#define diff r9 +#define pointStep r14 + +#define t1 r3 /*@// Reuse outPointStep*/ + +@// Real and Imaginary parts used in the inner grp loop +#define x0r s0 +#define x0i s1 +#define x1r s2 +#define x1i s3 +#define x2r s4 +#define x2i s5 +#define x3r s6 +#define x3i s7 + +@// Temporary reg to hold the twiddle multiplies + +#define t0r s8 +#define t0i s9 
+#define t2r s10 +#define t2i s11 +#define sr s12 +#define si s13 + + + + + .MACRO FFTSTAGE scaled, inverse , name + + @// Define stack arguments + + + @// Update grpCount and grpSize rightaway inorder to reuse + @// pGrpCount and pGrpSize regs + + LSL grpCount,subFFTSize,#2 + lsr subFFTNum, subFFTNum, #2 + mov subFFTSize, grpCount + + + @// pT0+1 increments pT0 by 8 bytes + @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes + mov pointStep, subFFTNum, lsl #1 + + + @// pOut0+1 increments pOut0 by 8 bytes + @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size + @// bytes + + @// Use setCount as dummy. It's set correctly below. + smull outPointStep, setCount, grpCount, pointStep + + LSL pointStep,pointStep,#2 @// 2*grpSize + + + MOV setCount,pointStep,LSR #3 + + @// Interchange grpLoop and setLoop + +setLoop\name: + + MOV step,#0 + @// Set pSrc and pDst for the grpLoop + + SUB diff,outPointStep,pointStep + + @// Save setCount on stack to reuse the reg + + ADD pSrc,pSrc,diff,LSL #2 @// pSrc += (grpCount-1)*grpStep + ADD pDst,pDst,diff @// pDst += (grpCount-1)*setCount + ADD step,step,diff @// step += (grpCount-1)*setCount + + + + @// Loop on the grps + +grpLoop\name: + + + + @// butterfly loop + add pSrc, pointStep + vldm.f32 pSrc, {x3r, x3i} @// data[1] + add pTwiddle, step + vldm.f32 pTwiddle, {x1r, x1i} @// coef[1] + add pTwiddle, step + vldm.f32 pTwiddle, {x2r, x2i} @// coef[2] + add pSrc, pointStep + vldm.f32 pSrc, {x0r, x0i} @// data[2] + + @// do first complex multiply + vmul.f32 t0r, x3r, x1r + vmul.f32 t0i, x3i, x1r + + .ifeqs "\inverse", "TRUE" + vmla.f32 t0r, x3i, x1i + vmls.f32 t0i, x3r, x1i + vmov.f32 x1r, t0r + vmov.f32 x1i, t0i + .else + vmls.f32 t0r, x3i, x1i + vmla.f32 t0i, x3r, x1i + vmov.f32 x1r, t0r + vmov.f32 x1i, t0i + .endif + + add pTwiddle, pTwiddle, step + vldm pTwiddle, {x3r, x3i} @// coef[3] + sub pTwiddle, pTwiddle, step + + @// do second complex multiply + vmul.f32 t0r, x0r, x2r + vmul.f32 t0i, x0i, x2r + 
+ .ifeqs "\inverse", "TRUE" + vmla.f32 t0r, x0i, x2i + vmls.f32 t0i, x0r, x2i + vmov.f32 x2r, t0r + vmov.f32 x2i, t0i + .else + vmls.f32 t0r, x0i, x2i + vmla.f32 t0i, x0r, x2i + vmov.f32 x2r, t0r + vmov.f32 x2i, t0i + .endif + + add pSrc, pointStep + vldm pSrc, {x0r, x0i} @// data[3] + sub pSrc, pointStep + + SUB pTwiddle,pTwiddle,step,LSL #1 @// reset pTwiddle + SUBS step,step,pointStep @// decrement loop counter + + @// do third complex multiply + SUB pSrc,pSrc,pointStep,LSL #1 @// reset pSrc to data[0] + vmul.f32 t0r, x0r, x3r + vmul.f32 t0i, x0i, x3r + + .ifeqs "\inverse", "TRUE" + vmla.f32 t0r, x0i, x3i + vmls.f32 t0i, x0r, x3i + vmov.f32 x3r, t0r + vmov.f32 x3i, t0i + .else + vmls.f32 t0r, x0i, x3i + vmla.f32 t0i, x0r, x3i + vmov.f32 x3r, t0r + vmov.f32 x3i, t0i + .endif + + vldm pSrc, {x0r, x0i} @// data[0] + + @// finish first stage of 4 point FFT + vadd.f32 x0r,x0r,x2r @// x0 = x0 + x2 (u0) + vadd.f32 x0i,x0i,x2i + + vadd.f32 sr, x2r, x2r + vadd.f32 si, x2i, x2i + vsub.f32 x2r,x0r,sr @// x2 = x0 - x2 (u1) + vsub.f32 x2i,x0i,si + + vadd.f32 x1r,x1r,x3r @// x1 = x1/2 + x3/2 (u2/2) + vadd.f32 x1i,x1i,x3i + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vsub.f32 x3r,x1r,sr @// x3 = x1/2 - x3/2 (u3/2) + vsub.f32 x3i,x1i,si + + + @// finish second stage of 4 point FFT + + @// y0 = u1-u2 since twiddle's are stored as -ve values + vsub.f32 x2r,x2r,x1r + vsub.f32 x2i,x2i,x1i + + vadd.f32 sr, x1r, x1r + vadd.f32 si, x1i, x1i + vadd.f32 x1r,x2r,sr @// y2 = u1+u2 + vadd.f32 x1i,x2i,si + vstm pDst, {x2r, x2i} @// store y0 + + vsub.f32 x0r,x0r,x3i @// y3 = u0+ju3 + vadd.f32 x0i,x0i,x3r + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vadd.f32 t2r,x0r,si @// y1 = u0-ju3 + vsub.f32 t2i,x0i,sr @// t2 will be same as x2r reg + + .ifeqs "\inverse", "TRUE" + add pDst, outPointStep + vstm pDst, {t2r, t2i} @// store y1 + add pDst, outPointStep + vstm pDst, {x1r, x1i} @// store y2 + add pDst, outPointStep + vstm pDst, {x0r, x0i} @// store y3 + sub pDst, outPointStep + 
.else + add pDst, outPointStep + vstm pDst, {x0r, x0i} @// store y1 + add pDst, outPointStep + vstm pDst, {x1r, x1i} @// store y2 + add pDst, outPointStep + vstm pDst, {t2r, t2i} @// store y3 + sub pDst, outPointStep + .endif + + SUB pDst,pDst,outPointStep, LSL #1 @// reset pDst + @// update the pDst for the next grp + SUBGE pDst,pDst,pointStep + @// update the pSrc for the next grp + SUBGE pSrc,pSrc,pointStep,LSL #2 + + + BGE grpLoop\name + + ADD pSrc,pSrc,#8 @// pSrc += 1; for the next set + ADD pDst,pDst,#8 @// pDst += 1; for the next set + + SUBS setCount,setCount,#1 @// decrement loop counter + + + BGT setLoop\name + + @// Reset and Swap pSrc and pDst for the next stage + MOV t1,pDst + SUB pDst,pSrc,subFFTNum,LSL #3 + SUB pSrc,t1,subFFTNum,LSL #3 + + .endm + + + M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","TRUE",INV + M_END + + +@// ENDIF @//ARM1136JS + + + +@// Guarding implementation by the processor name + + .end diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S new file mode 100644 index 00000000000..4ac2da47ac3 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S @@ -0,0 +1,386 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S +@// to support float instead of SC32. 
+@// + +@// +@// Description: +@// Compute a first stage Radix 8 FFT stage for a N point complex signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r2 +#define pTwiddle r1 +#define subFFTNum r6 +#define subFFTSize r7 +#define pPingPongBuf r5 + + +@//Output Registers + + +@//Local Scratch Registers + +#define grpSize r14 +#define step1 r3 +#define step2 r8 +#define setCount r14 /*@// Reuse grpSize as setCount*/ +#define pointStep r12 + +#define t0 r4 +@// Real and Imaginary parts + +#define x0r s0 +#define x0i s1 +#define x1r s2 +#define x1i s3 +#define x2r s4 +#define x2i s5 +#define x3r s6 +#define x3i s7 +#define t3r s8 /*@// Temporarily hold x3r and x3i*/ +#define t3i s9 +#define t1r s4 +#define t1i s5 +#define sr s10 +#define si s11 +#define roothalf s12 + +@// Define macros to load/store two float regs from/to the stack. 
+ .macro M_VSTM r0, r1, p + .set _Offset, _Workspace + \p\()_F + add t0, sp, #_Offset + vstm.f32 t0, {\r0, \r1} + .endm + + .macro M_VLDM r0, r1, p + .set _Offset, _Workspace + \p\()_F + add t0, sp, #_Offset + vldm.f32 t0, {\r0, \r1} + .endm + +@// Define constants + + .MACRO FFTSTAGE scaled, inverse , name + + @// Define stack arguments + + + @// Update grpCount and grpSize rightaway inorder to reuse + @// pSubFFTSize and pSubFFTNum regs + + mov subFFTSize, #8 + lsr grpSize, subFFTNum, #3 + mov subFFTNum, grpSize + + + @// pT0+1 increments pT0 by 8 bytes + @// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes + @// Note: setCount = grpSize/8 (reuse the updated grpSize for + @// setCount) + MOV pointStep,grpSize,LSL #3 + + + @// Calculate the step of input data for the next set + MOV step1,grpSize,LSL #4 + MOV step2,pointStep,LSL #3 + SUB step2,step2,pointStep @// step2 = 7*pointStep + + + @// grp = 0 a special case since all the twiddle factors are 1 + @// Loop on the sets + + movw t0,#0x04f3 + movt t0,#0x3f35 + vmov.f32 roothalf, t0 @// roothalf = sqrt(1/2) + +grpZeroSetLoop\name: + + vldm.f32 pSrc, {x0r, x0i} @// x0 + add pSrc, step1 + vldm.f32 pSrc, {x1r, x1i} @// x2 + add pSrc, step1 + vldm.f32 pSrc, {x2r, x2i} @// x4 + add pSrc, step1 + vldm.f32 pSrc, {x3r, x3i} @// x6 + add pSrc, step1 + + SUB pSrc, pSrc, step2 + + @// finish first stage of 8 point FFT and save on stack + + vadd.f32 x0r,x0r,x2r @// u0 + vadd.f32 x0i,x0i,x2i + + vadd.f32 sr, x2r, x2r + vadd.f32 si, x2i, x2i + vsub.f32 x2r,x0r,sr @// u1 + vsub.f32 x2i,x0i,si + + M_VSTM x0r,x0i, pU0 + M_VSTM x2r,x2i, pU1 + + vadd.f32 x1r,x1r,x3r @// u4 + vadd.f32 x1i,x1i,x3i + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vsub.f32 x3r,x1r,sr @// u5 + vsub.f32 x3i,x1i,si + + M_VSTM x1r,x1i, pU4 + M_VSTM x3r,x3i, pU5 + + + vldm pSrc, {x0r, x0i} @// x1 + add pSrc, step1 + vldm pSrc, {x1r, x1i} @// x3 + add pSrc, step1 + vldm pSrc, {x2r, x2i} @// x5 + add pSrc, step1 + vldm pSrc, {x3r, x3i} @// x7 
+ add pSrc, #8 + + SUB pSrc, pSrc, step2 + + vadd.f32 x0r,x0r,x2r @// u2 + vadd.f32 x0i,x0i,x2i + + vadd.f32 sr, x2r, x2r + vadd.f32 si, x2i, x2i + vsub.f32 x2r,x0r,sr @// u3 + vsub.f32 x2i,x0i,si + + M_VSTM x2r,x2i, pU3 + + vadd.f32 x1r,x1r,x3r @// u6 + vadd.f32 x1i,x1i,x3i + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vsub.f32 x3r,x1r,sr @// u7 + vsub.f32 x3i,x1i,si + + @// finish second and third stage of 8 point FFT + + M_VSTM x3r,x3i, pU7 + M_VLDM x2r,x2i, pU0 + + @// Decrement setcount + SUBS setCount,setCount,#1 + M_VLDM x3r,x3i, pU4 + + vadd.f32 x0r,x0r,x1r @// v4 + vadd.f32 x0i,x0i,x1i + + vadd.f32 sr, x1r, x1r + vadd.f32 si, x1i, x1i + vsub.f32 x1r,x0r,sr @// v6 + vsub.f32 x1i,x0i,si + + vadd.f32 x2r,x2r,x3r @// v0 + vadd.f32 x2i,x2i,x3i + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vsub.f32 x3r,x2r,sr @// v2 + vsub.f32 x3i,x2i,si + + + + vadd.f32 x2r,x2r,x0r @// y0 + vadd.f32 x2i,x2i,x0i + + vadd.f32 sr, x0r, x0r + vadd.f32 si, x0i, x0i + vsub.f32 x0r,x2r,sr @// y4 + vsub.f32 x0i,x2i,si + + vstm pDst, {x2r, x2i} @// store y0 + add pDst, step1 + + vadd.f32 x3r,x3r,x1i @// y6 + vsub.f32 x3i,x3i,x1r + + vadd.f32 sr, x1r, x1r + vadd.f32 si, x1i, x1i + vsub.f32 t1r,x3r,si @// t1r=x2r reg;t1i=x2i reg + vadd.f32 t1i,x3i,sr @// y2 + + .ifeqs "\inverse", "TRUE" + vstm pDst, {t1r, t1i} @// store y2 + add pDst, step1 + vstm pDst, {x0r, x0i} @// store y4 + add pDst, step1 + vstm pDst, {x3r, x3i} @// store y6 + add pDst, step1 + .else + vstm pDst, {x3r, x3i} @// store y2 + add pDst, step1 + vstm pDst, {x0r, x0i} @// store y4 + add pDst, step1 + vstm pDst, {t1r, t1i} @// store y6 + add pDst, step1 + .endif + + SUB pDst, pDst, step2 @// set pDst to y1 + + + M_VLDM x0r,x0i,pU1 @// Load u1,u3,u5,u7 + M_VLDM x1r,x1i,pU5 + M_VLDM x3r,x3i,pU7 + + vsub.f32 x0r,x0r,x1i @// v1 + vadd.f32 x0i,x0i,x1r + vadd.f32 sr, x1r, x1r + vadd.f32 si, x1i, x1i + vadd.f32 t1r,x0r,si @// t1r=x2r reg;t1i=x2i reg + vsub.f32 t1i,x0i,sr @// v3 + + M_VLDM x1r,x1i,pU3 + + vsub.f32 
x1r,x1r,x3i @// v5 + vadd.f32 x1i,x1i,x3r + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vadd.f32 t3r,x1r,si @// t3i = x3i + vsub.f32 t3i,x1i,sr @// v7 + + @// store v5 as (v5.r - v5.i,v5.r + v5.i) + @// store v7 as (v7.i + v7.r,v7.i - v7.r) + + vadd.f32 x3r,t3i,t3r @// v7 + vsub.f32 x3i,t3i,t3r + + vsub.f32 x1r,x1r,x1i @// v5 + vadd.f32 x1i, x1i + vadd.f32 x1i,x1r,x1i + + vmul.f32 x3r, x3r, roothalf @// (v7.i + v7.r)*(1/sqrt(2)) + vmul.f32 x3i, x3i, roothalf @// (v7.i - v7.r)*(1/sqrt(2)) + vmul.f32 x1r, x1r, roothalf @// (v5.r - v5.i)*(1/sqrt(2)) + vmul.f32 x1i, x1i, roothalf @// (v5.r + v5.i)*(1/sqrt(2)) + + vadd.f32 x2r,x2r,x3r @// y7 + vadd.f32 x2i,x2i,x3i + + vadd.f32 sr, x3r, x3r + vadd.f32 si, x3i, x3i + vsub.f32 x3r,x2r,sr @// y3 + vsub.f32 x3i,x2i,si + + + vsub.f32 x0r,x0r,x1r @// y5 + vsub.f32 x0i,x0i,x1i + + vadd.f32 sr, x1r, x1r + vadd.f32 si, x1i, x1i + vadd.f32 x1r,x0r,sr @// y1 + vadd.f32 x1i,x0i,si + + .ifeqs "\inverse", "TRUE" + vstm pDst, {x1r, x1i} @// store y1 + add pDst, step1 + vstm pDst, {x3r, x3i} @// store y3 + add pDst, step1 + vstm pDst, {x0r, x0i} @// store y5 + add pDst, step1 + vstm pDst, {x2r, x2i} @// store y7 + add pDst, #8 + .else + vstm pDst, {x2r, x2i} @// store y1 + add pDst, step1 + vstm pDst, {x0r, x0i} @// store y3 + add pDst, step1 + vstm pDst, {x3r, x3i} @// store y5 + add pDst, step1 + vstm pDst, {x1r, x1i} @// store y7 + add pDst, #8 + .endif + + SUB pDst, pDst, step2 @// update pDst for the next set + + + BGT grpZeroSetLoop\name + + + @// reset pSrc to pDst for the next stage + SUB pSrc,pDst,pointStep @// pDst -= 2*grpSize + mov pDst, pPingPongBuf + + + .ENDM + + + + + + @// Allocate stack memory required by the function + + @// Ensure 8 byte alignment to use M_VLDM + M_ALLOC8 pU0, 8 + M_ALLOC8 pU1, 8 + M_ALLOC8 pU3, 8 + M_ALLOC8 pU4, 8 + M_ALLOC8 pU5, 8 + M_ALLOC8 pU7, 8 + + M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","FALSE",FWD + M_END + + @// Allocate stack memory 
required by the function + + @// Ensure 8 byte alignment to use M_VLDM + M_ALLOC8 pU0, 8 + M_ALLOC8 pU1, 8 + M_ALLOC8 pU3, 8 + M_ALLOC8 pU4, 8 + M_ALLOC8 pU5, 8 + M_ALLOC8 pU7, 8 + + M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4 + FFTSTAGE "FALSE","TRUE",INV + M_END + +@// ENDIF @//ARM1136JS + + + +@// Guarding implementation by the processor name + + + .END diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S new file mode 100644 index 00000000000..25b4976ca80 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S @@ -0,0 +1,161 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of omxSP_FFTFwd_CToC_SC32_Sfs_s.S +@// to support float instead of SC32. 
+@// + +@// +@// Description: +@// Compute a forward FFT for a complex signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 + + +@// Output registers +#define result r0 + +@//Local Scratch Registers + +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define pTwiddle r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +#define count r8 +#define diffMinusOne r2 +#define round r3 + +#define x0r s0 +#define x0i s1 + + + + + @// Allocate stack memory required by the function + + @// Write function header + M_START omxSP_FFTFwd_CToC_FC32_Sfs_vfp,r11 + +@ Structure offsets for FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + @// Define stack arguments + + @// Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @// Read other structure parameters + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + CLZ order,N @// N = 2^order + RSB order,order,#31 + MOV subFFTSize,#1 + @//MOV subFFTNum,N + + + CMP order,#1 + BGT orderGreaterthan1 @// order > 1 + @// order = 0, 1 + vldmlt.f32 pSrc, {x0r, x0i} + vstmlt.f32 pDst, {x0r, x0i} + + MOVLT pSrc,pDst + BLT End + + @// Handle order = 1 + MOV argDst,pDst @// Set input args to fft
stages + MOV argTwiddle,pTwiddle + BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + B End + + + +orderGreaterthan1: + + TST order, #2 @// Set input args to fft stages + MOVNE argDst,pDst + MOVEQ argDst,pOut + MOVEQ pOut,pDst @// Pass the first stage destination in RN5 + MOV argTwiddle,pTwiddle + + @//check for even or odd order + + @// NOTE: The following combination of BL's would work fine + @// eventhough the first BL would corrupt the flags. This is + @// because the end of the "grpZeroSetLoop" loop inside + @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + @// sets the Z flag to EQ + + TST order,#0x00000001 + BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + +unscaledRadix4Loop: + CMP subFFTNum,#1 + BEQ End + BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + B unscaledRadix4Loop + + +End: + @// Set return value + MOV result, #OMX_Sts_NoErr + + @// Write function tail + M_END + +@// ENDIF @//ARM1136JS + + + @// Guarding implementation by the processor name + + + + .end diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S new file mode 100644 index 00000000000..dd1690ad10b --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S @@ -0,0 +1,328 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s +@// to support float instead of SC32. 
+@// + +@// +@// Description: +@// Compute FFT for a real signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 + + +@// Output registers +#define result r0 + +@//Local Scratch Registers + +@// N=1 case +#define scaleMinusOne r2 +#define rnd r2 +#define zero r8 +#define Zero r9 + + +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define pTwiddle r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +#define count r8 +#define diffMinusOne r10 +#define round r3 + +#define step r3 +#define step1 r6 +#define twStep r12 +#define pTwiddleTmp r14 +#define t0 r12 +#define t1 r14 /*@// pTwiddleTmp*/ +#define t2 r0 +#define t3 r1 /*@// pSrc,argTwiddle*/ +#define t4 r6 +#define t5 r7 /*@// step1,subFFTSize*/ + +#define x0r s0 +#define x0i s1 +#define y0r s2 +#define y0i s3 +#define x1r s4 +#define x1i s5 +#define w1r s2 +#define w1i s3 +#define w0r s6 +#define w0i s7 +#define y1r s2 /*@// w1r,w1i*/ +#define y1i s3 +#define st0 s8 +#define st1 s9 +#define st2 s10 +#define st3 s11 +#define st4 s12 +#define st5 s13 +#define half s15 + + + + + @// Allocate stack memory required by the function + + + + @// Write function header + M_START omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11 + +@ Structure offsets for FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set 
ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + @// Define stack arguments + + @// Setup half value + movw N, #0 @// Use N as a temp. + movt N, #0x3f00 + vmov.f32 half, N + + @// Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @// Read other structure parameters + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + @// N=1: treat separately + CMP N,#1 + BGT sizeGreaterThanOne + @// N<=1 is not supported + @// Set return value + MOV result, #OMX_Sts_NoErr + B FunctionEnd + +sizeGreaterThanOne: + @// Do a N/2 point complex FFT including the scaling + + MOV N,N,ASR #1 @// N/2 point complex FFT + CLZ order,N @// N = 2^order + RSB order,order,#31 + MOV subFFTSize,#1 + @//MOV subFFTNum,N + + + CMP order,#1 + BGT orderGreaterthan1 @// order > 1 + vldmlt.f32 pSrc, {x0r, x0i} + vstmlt.f32 pOut, {x0r, x0i} + MOVLT pSrc,pOut + MOVLT argDst,pDst + BLT FFTEnd + + MOV argDst,pOut @// Set input args to fft stages + MOV pOut,pDst @// Set input args to fft stages + MOV argTwiddle,pTwiddle + + BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + B finalComplexToRealFixup + +orderGreaterthan1: + + TST order, #2 @// Set input args to fft stages + MOVEQ argDst,pDst + MOVNE argDst,pOut + MOVNE pOut,pDst @// Pass the first stage dest in RN5 + MOV argTwiddle,pTwiddle + + @//check for even or odd order + + @// NOTE: The following combination of BL's would work fine + @// even though the first BL would corrupt the flags.
This is + @// because the end of the "grpZeroSetLoop" loop inside + @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets + @// the Z flag to EQ + + TST order,#0x00000001 + BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + +unscaledRadix4Loop: + CMP subFFTNum,#1 + BEQ FFTEnd + BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + B unscaledRadix4Loop + +FFTEnd: +finalComplexToRealFixup: + + @// step = N/2 * 8 bytes + MOV step,subFFTSize,LSL #3 + @// twStep = 3N/8 * 8 bytes pointing to W^1 + SUB twStep,step,subFFTSize,LSL #1 + @// step1 = N/4 * 8 = N/2*4 bytes + MOV step1,subFFTSize,LSL #2 + @// (N/4-1)*8 bytes + SUB step1,step1,#8 + + @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)] + @// 1/2 [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] + @// 1/2 [2a+j0] - j [0+j2b] + @// (a+b, 0) + + @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)] + @// 1/2 [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] + @// 1/2 [2a+j0] + j [0+j2b] + @// (a-b, 0) + + @// F(0) and F(N/2) + vldm.f32 pSrc!, {x0r, x0i} + vadd.f32 y0r,x0r,x0i @// F(0) = (Z0.r+Z0.i , 0) + vsub.f32 x0r,x0r,x0i @// F(N/2) = (Z0.r-Z0.i , 0) + vsub.f32 y0i, y0i @ y0i and x0i set to 0.0 + vsub.f32 x0i, x0i + + add argDst, step + vstm.f32 argDst, {x0r, x0i} @// {x0r,x0i}->[argDst, step] + sub argDst, step + vstm.f32 argDst!, {y0r, y0i} + + SUBS subFFTSize,subFFTSize,#2 + + ADD pTwiddleTmp,argTwiddle,#8 @// W^2 + ADD argTwiddle,argTwiddle,twStep @// W^1 + BLT End + BEQ lastElement + + + @// F(k) = 1/2 [Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] + @// Process 2 elements at a time.
E.g: F(1) and F(N/2-1) since + @// both of them require Z(1) and Z(N/2-1) + + ASR subFFTSize,subFFTSize,#1 +evenOddButterflyLoop: + + SUB step,step,#16 @// (N/2-2)*8 bytes + + add pSrc, step + vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step] + sub pSrc, step + vldm.f32 pSrc!, {x0r, x0i} + add argTwiddle, step1 + vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1] + sub argTwiddle, step1 + vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8 + + SUB step1,step1,#8 + SUBS subFFTSize,subFFTSize,#1 + + vsub.f32 st2,x0r,x1r @// a-c + vadd.f32 st3,x0i,x1i @// b+d + vadd.f32 st0,x0r,x1r @// a+c + vsub.f32 st1,x0i,x1i @// b-d + + vmul.f32 x1r,w1r,st2 + vmul.f32 x1i,w1r,st3 + vmla.f32 x1r,w1i,st3 @// x1r = w1r*st2 + w1i*st3 + @//RSB x1r,x1r,#0 + vmls.f32 x1i,w1i,st2 @// x1i = w1r*st3 - wli*st2 + + vsub.f32 y1r, st0, x1i + vadd.f32 y1i, x1r, st1 + vneg.f32 y1i, y1i + + vmul.f32 x0r,w0r,st2 + vmul.f32 x0i,w0r,st3 + vmls.f32 x0r,w0i,st3 @// x0r = w0r*st2 - w0i*st3 + vmla.f32 x0i,w0i,st2 @// x0i = w0r*st3 + x0i*st1 + + vsub.f32 st4,st0,x0i @// F(1) + vadd.f32 st5,x0r,st1 + + + vmul.f32 y1r, half + vmul.f32 y1i, half + vmul.f32 st4, half + vmul.f32 st5, half + + add argDst, step + vstm.f32 argDst, {y1r, y1i} @// {y1r,y1i} -> [argDst,step] + sub argDst, step + vstm.f32 argDst!, {st4, st5} + + + MOV t0,argTwiddle @// swap ptr for even and odd twiddles + MOV argTwiddle,pTwiddleTmp + MOV pTwiddleTmp,t0 + + BGT evenOddButterflyLoop + + @// Last element can be expanded as follows + @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)] + @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)] + @// 1/2[2a+j0] + j (c+jd) [0+j2b] + @// (a-bc, -bd) + +lastElement: + vldm.f32 pSrc, {x0r, x0i} + vneg.f32 x0i, x0i + vstm.f32 argDst, {x0r, x0i} + +End: + @// Set return value + MOV result, #OMX_Sts_NoErr + +FunctionEnd: + @// Write function tail + M_END + +@// ENDIF @//ARM1136JS + + + @// Guarding implementation by the processor name + + + + .end diff --git 
a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S new file mode 100644 index 00000000000..d6a47652738 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S @@ -0,0 +1,227 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree. An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of omxSP_FFTInv_CCSToR_S32_Sfs_s.s +@// to support float instead of SC32. +@// + +@// +@// Description: +@// Compute an inverse FFT for a complex signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + .extern armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 + + +@// Output registers +#define result r0 + +@//Local Scratch Registers + + +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define pTwiddle r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +@// Total num of radix 
stages required to comple the FFT*/ +#define count r8 + +#define round r3 + +#define x0r s0 +#define x0i s1 +#define y0r s2 +#define y0i s3 +#define x1r s4 +#define x1i s5 +#define w1r s2 +#define w1i s3 +#define w0r s6 +#define w0i s7 +#define y1r s2 /*@// w1r,w1i*/ +#define y1i s3 +#define st0 s8 +#define st1 s9 +#define st2 s10 +#define st3 s11 +#define st4 s12 +#define st5 s13 +#define fscale s2 +#define fone s3 + + + + @// Allocate stack memory required by the function + M_ALLOC4 pDstOnStack, 4 + M_ALLOC4 pFFTSpecOnStack, 4 + + @// Write function header + M_START omxSP_FFTInv_CCSToR_F32_Sfs_vfp,r11 + +@ Structure offsets for FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + @// Define stack arguments + + @// Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + + + @// N<=1: treat separately (copy a single float from pSrc to pDst) + CMP N,#1 + BGT sizeGreaterThanOne + vldr.f32 x0r, [pSrc] + vstr.f32 x0r, [pDst] + + B End + +sizeGreaterThanOne: + M_STR pDst,pDstOnStack @// store all the pointers + M_STR pFFTSpec,pFFTSpecOnStack + + + @// Call the preTwiddle Radix2 stage before doing the complex IFFT + + BL armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp + + +complexIFFT: + + M_LDR pFFTSpec,pFFTSpecOnStack + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + ASR N,N,#1 @// N/2 point complex IFFT + ADD pSrc,pOut,N,LSL #3 @// set pSrc as pOut1 + M_LDR pDst,pDstOnStack + + CLZ order,N @// N = 2^order + RSB order,order,#31 + MOV subFFTSize,#1 + + CMP order,#1 + BGT orderGreaterthan1 @// order > 1 + vldmlt.f32 pSrc, {x0r, x0i} + vstmlt.f32 pDst, {x0r, x0i} + + MOVLT pSrc,pDst + BLT FFTEnd + + MOV argDst,pDst @// Set input args to fft stages + MOV argTwiddle,pTwiddle + + BL armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + B FFTEnd + + +orderGreaterthan1: + + TST order, #2 @// Set input args to fft stages + MOVNE
argDst,pDst + MOVEQ argDst,pOut + MOVEQ pOut,pDst @// Pass the first stage destination in RN5 + MOV argTwiddle,pTwiddle + + + @//check for even or odd order + + @// NOTE: The following combination of BL's would work fine + @// even though the first BL would corrupt the flags. This is + @// because the end of the "grpZeroSetLoop" loop inside + @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets + @// the Z flag to EQ + + TST order,#0x00000001 + BLEQ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + BLNE armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + +unscaledRadix4Loop: + CMP subFFTNum,#1 + BEQ FFTEnd + BL armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + B unscaledRadix4Loop + +FFTEnd: + + vldm.f32 pSrc, {x0r, x0i} + + vmov.f32 fscale, subFFTSize + vcvt.f32.s32 fscale, fscale @// fscale = N as a float + mov round, #1 + vmov.f32 fone, round + vcvt.f32.s32 fone, fone @// fone = 1.0 (integer 1 converted to float) + vdiv.f32 fscale, fone, fscale @// fscale = 1/N + +scaleFFTData: @// N = subFFTSize + SUBS subFFTSize,subFFTSize,#1 + vmul.f32 x0r, x0r, fscale + vmul.f32 x0i, x0i, fscale + vstm.f32 pSrc!, {x0r, x0i} + vldmgt.f32 pSrc, {x0r, x0i} + + BGT scaleFFTData + + +End: + @// Set return value + MOV result, #OMX_Sts_NoErr + + @// Write function tail + M_END + +@// ENDIF @//ARM1136JS + + + @// Guarding implementation by the processor name + + + + .end diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S new file mode 100644 index 00000000000..64aa5da8c5a --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S @@ -0,0 +1,180 @@ +@// +@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@// +@// Use of this source code is governed by a BSD-style license +@// that can be found in the LICENSE file in the root of the source +@// tree.
An additional intellectual property rights grant can be found +@// in the file PATENTS. All contributing project authors may +@// be found in the AUTHORS file in the root of the source tree. +@// +@// This is a modification of omxSP_FFTInv_CToC_SC32_Sfs_s.s +@// to support float instead of SC32. +@// + +@// +@// Description: +@// Compute an inverse FFT for a complex signal +@// +@// + + +@// Include standard headers + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@// M_VARIANTS ARM1136JS + +@// Import symbols required from other files +@// (For example tables) + + .extern armSP_FFTInv_CToC_FC32_Sfs_Radix2_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + .extern armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + +@// Set debugging level +@//DEBUG_ON SETL {TRUE} + + + +@// Guarding implementation by the processor name + +@// IF ARM1136JS + +@//Input Registers + +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 + + +@// Output registers +#define result r0 + +@//Local Scratch Registers + +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define pTwiddle r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +#define count r8 +#define diffMinusOne r2 +#define round r3 + +#define x0r s0 +#define x0i s1 +#define fone s2 +#define fscale s3 + + + @// Allocate stack memory required by the function + + @// Write function header + M_START omxSP_FFTInv_CToC_FC32_Sfs_vfp,r11 + +@ Structure offsets for FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + @// Define stack arguments + + @// Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @// Read other structure parameters + LDR pTwiddle, [pFFTSpec, 
#ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + CLZ order,N @// N = 2^order + RSB order,order,#31 + MOV subFFTSize,#1 + @//MOV subFFTNum,N + + CMP order,#1 + BGT orderGreaterthan1 @// order > 1 + @// Order = 0 or 1 + vldmlt.f32 pSrc, {x0r, x0i} + vstmlt.f32 pDst, {x0r, x0i} + + MOVLT pSrc,pDst + BLT FFTEnd + + @// Handle order = 1 + MOV argDst,pDst + MOV argTwiddle,pTwiddle + + BL armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp + B FFTEnd + +orderGreaterthan1: + + TST order, #2 @// Set input args to fft stages + MOVNE argDst,pDst + MOVEQ argDst,pOut + MOVEQ pOut,pDst @// Pass the first stage dest in RN5 + MOV argTwiddle,pTwiddle + + + @//check for even or odd order + @// NOTE: The following combination of BL's would work fine + @// even though the first BL would corrupt the flags. This is + @// because the end of the "grpZeroSetLoop" loop inside + @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets + @// the Z flag to EQ + + TST order,#0x00000001 + BLEQ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp + BLNE armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp + +unscaledRadix4Loop: + CMP subFFTNum,#1 + BEQ FFTEnd + BL armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp + B unscaledRadix4Loop + + +FFTEnd: + + vldm.f32 pSrc, {x0r, x0i} + + vmov.f32 fscale, subFFTSize + vcvt.f32.s32 fscale, fscale @// fscale = N as a float + movw round, #0 + movt round, #0x3f80 @// round = 1.0 + vmov.f32 fone, round + vdiv.f32 fscale, fone, fscale @// fscale = 1/N +scaleFFTData: @// N = subFFTSize + SUBS subFFTSize,subFFTSize,#1 + vmul.f32 x0r, x0r, fscale + vmul.f32 x0i, x0i, fscale + vstm.f32 pSrc, {x0r, x0i} + add pSrc, #8 + vldmgt.f32 pSrc, {x0r, x0i} + + bgt scaleFFTData + + + @// Set return value + MOV result, #OMX_Sts_NoErr + + @// Write function tail + M_END + +@// ENDIF @//ARM1136JS + + + @// Guarding implementation by the processor name + + + + .end diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/detect.c
b/chromium/third_party/openmax_dl/dl/sp/src/arm/detect.c new file mode 100644 index 00000000000..b74220a92fc --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/detect.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#include <cpu-features.h> + +#include "android/log.h" +#include "dl/sp/api/omxSP.h" + +int HasArmNeon() { /* Nonzero iff android_getCpuFeatures() reports the NEON feature bit. */ + return (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0; +} + +static void SetFFTRoutines() { + /* + * Choose the correct (NEON or non-NEON) routines for both the + * forward and inverse FFTs, store them in the omxSP_FFTFwd_RToCCS_F32 and omxSP_FFTInv_CCSToR_F32 function pointers, and log the choice. + */ + if (HasArmNeon()) { + __android_log_print(ANDROID_LOG_INFO, "OpenMAX DL FFT", + "Using NEON FFT"); + omxSP_FFTFwd_RToCCS_F32 = omxSP_FFTFwd_RToCCS_F32_Sfs; + omxSP_FFTInv_CCSToR_F32 = omxSP_FFTInv_CCSToR_F32_Sfs; + } else { + __android_log_print(ANDROID_LOG_INFO, "OpenMAX DL FFT", + "Using non-NEON FFT"); + omxSP_FFTFwd_RToCCS_F32 = omxSP_FFTFwd_RToCCS_F32_Sfs_vfp; + omxSP_FFTInv_CCSToR_F32 = omxSP_FFTInv_CCSToR_F32_Sfs_vfp; + } +} + +/* + * FIXME: It would be beneficial to use the GCC ifunc attribute to + * select the appropriate function at load time. This is apparently + * not supported on Android at this time. (Compiler warning that the + * ifunc attribute is ignored.) + */ + +/* + * Forward FFT. Detect if NEON is supported and update function + * pointers to the correct routines for both the forward and inverse + * FFTs. Then run the forward FFT routine.
+ */ +static OMXResult DetectForwardRealFFT( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec) { + SetFFTRoutines(); + return omxSP_FFTFwd_RToCCS_F32(pSrc, pDst, pFFTSpec); +} + +/* + * Inverse FFT. Detect if NEON is supported and update function + * pointers to the correct routines for both the forward and inverse + * FFTs. Then run the inverse FFT routine. + */ +static OMXResult DetectInverseRealFFT( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec) { + SetFFTRoutines(); + return omxSP_FFTInv_CCSToR_F32(pSrc, pDst, pFFTSpec); +} + +/* + * Implementation of the forward and inverse real float FFT. + * Initialize to detection routine which will update the pointer to + * the correct routine and then call the correct one. + */ +OMXResult (*omxSP_FFTFwd_RToCCS_F32)( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec) = DetectForwardRealFFT; + +OMXResult (*omxSP_FFTInv_CCSToR_F32)( + const OMX_F32* pSrc, + OMX_F32* pDst, + const OMXFFTSpec_R_F32* pFFTSpec) = DetectInverseRealFFT; diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S index f375991f7dd..f9dd26e491e 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S @@ -22,8 +22,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S new file mode 100644 index 
00000000000..950defde8ca --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S @@ -0,0 +1,409 @@ +@ +@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ +@ Some code in this file was originally from file +@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as +@ follows. It has been relicensed with permission from the copyright holders. +@ + +@ +@ OpenMAX DL: v1.0.2 +@ Last Modified Revision: 7485 +@ Last Modified Date: Fri, 21 Sep 2007 +@ +@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +@ + +@ +@ Description: +@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT. +@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation. +@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above +@ formula. +@ + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +@//Input Registers +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 +#define scale r3 + +@ Output registers +#define result r0 + +@//Local Scratch Registers +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define tmpOrder r4 +#define pTwiddle r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +@ Total num of radix stages to complete the FFT.
+#define count r8 +#define x0r r4 +#define x0i r5 +#define diffMinusOne r2 +#define round r3 +#define pOut1 r2 +#define size r7 +#define step r8 +#define step1 r9 +#define step2 r10 +#define twStep r10 +#define pTwiddleTmp r11 +#define argTwiddle1 r12 +#define zero r14 + +@ Neon registers +#define dX0 D0.S16 +#define dX0S32 D0.S32 +#define dShift D1.S16 +#define dX1 D1.S16 +#define dX1S32 D1.S32 +#define dY0 D2.S16 +#define dY1 D3.S16 +#define dX0r D0.S16 +#define dX0rS32 D0.S32 +#define dX0i D1.S16 +#define dX1r D2.S16 +#define dX1i D3.S16 +#define qX1 Q1.S16 +#define dW0r D4.S16 +#define dW0i D5.S16 +#define dW1r D6.S16 +#define dW1i D7.S16 +#define dW0rS32 D4.S32 +#define dW0iS32 D5.S32 +#define dW1rS32 D6.S32 +#define dW1iS32 D7.S32 +#define dT0 D8.S16 +#define dT1 D9.S16 +#define dT2 D10.S16 +#define dT3 D11.S16 +#define qT0 Q6.S32 +#define qT1 Q7.S32 +#define qT2 Q8.S32 +#define qT3 Q9.S32 +#define dY0r D4.S16 +#define dY0i D5.S16 +#define dY1r D6.S16 +#define dY1i D7.S16 +#define qY1 Q3.S16 +#define dY2 D4.S16 +#define dY3 D5.S16 +#define dW0 D6.S16 +#define dW1 D7.S16 +#define dW0Tmp D10.S16 +#define dW1Neg D11.S16 + + @ Structure offsets for the FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + .MACRO FFTSTAGE scaled, inverse, name + + @ Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @ Read other structure parameters + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + MOV size,N,ASR #1 @ preserve the contents of N + MOV step,N,LSL #1 @ step = N/2 * 4 bytes + + @ Process different FFT sizes with different loops. + CMP size,#4 + BLE smallFFTSize\name + + @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]} + @ Note: W^(k) is stored as negated value and also need to + @ conjugate the values from the table. 
+ + @ Z(0) : no need of twiddle multiply + @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] } + + VLD1 dX0S32[0],[pSrc],step + ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes + + VLD1 dX1S32[0],[pSrc]! + SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1 + + MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes + SUB step1,step1,#4 @ (N/4-1)*4 bytes + + VHADD dY0,dX0,dX1 @ [b+d | a+c] + VHSUB dY1,dX0,dX1 @ [b-d | a-c] + VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] + + .ifeqs "\scaled", "TRUE" + VHSUB dX0,dY0,dY1 + SUBS size,size,#2 + VHADD dX1,dY0,dY1 + .else + VSUB dX0,dY0,dY1 + SUBS size,size,#2 + VADD dX1,dY0,dY1 + .endif + + SUB pSrc,pSrc,step + VST1 dX0[0],[pOut1]! + ADD pTwiddleTmp,pTwiddle,#4 @ W^2 + VST1 dX1[1],[pOut1]! + ADD argTwiddle1,pTwiddle,twStep @ W^1 + + BLT decrementScale\name + BEQ lastElement\name + + SUB step,step,#20 + SUB step1,step1,#4 @ (N/4-1)*8 bytes + SUB step2, step1, #4 + + @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)] + @ Note: W^k is stored as negative values in the table and also need to + @ conjugate the values from the table. + @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) + @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1). + +evenOddButterflyLoop\name: + VLD2 {dX0r,dX0i},[pSrc],step + VLD2 {dX1r,dX1i},[pSrc]! + SUB pSrc, pSrc, step + + VLD1 dW0r,[argTwiddle1],step1 + VREV64 qX1,qX1 + VLD1 dW1r,[argTwiddle1]! + VHSUB dT2,dX0r,dX1r @ a-c + SUB argTwiddle1, argTwiddle1, step1 + SUB step1,step1,#16 + + VLD1 dW0i,[pTwiddleTmp],step2 + VHADD dT3,dX0i,dX1i @ b+d + VLD1 dW1i,[pTwiddleTmp]! 
+ VHADD dT0,dX0r,dX1r @ a+c + VHSUB dT1,dX0i,dX1i @ b-d + SUB pTwiddleTmp, pTwiddleTmp, step2 + SUB step2,step2,#16 + + SUBS size,size,#8 + + VZIP dW1r,dW1i + VTRN dW0r,dW0i + VZIP dW1iS32, dW1rS32 + + VMULL qT0,dW1i,dT2 + VMLSL qT0,dW1r,dT3 + VMULL qT1,dW1i,dT3 + VMLAL qT1,dW1r,dT2 + VMULL qT2,dW0r,dT2 + VMLAL qT2,dW0i,dT3 + VMULL qT3,dW0r,dT3 + VMLSL qT3,dW0i,dT2 + + VRSHRN dX1r,qT0,#15 + VRSHRN dX1i,qT1,#15 + VRSHRN dX0r,qT2,#15 + VRSHRN dX0i,qT3,#15 + + .ifeqs "\scaled", "TRUE" + VHADD dY1r,dT0,dX1i @ F(N/2 -1) + VHSUB dY1i,dX1r,dT1 + .else + VADD dY1r,dT0,dX1i @ F(N/2 -1) + VSUB dY1i,dX1r,dT1 + .endif + + .ifeqs "\scaled", "TRUE" + VHADD dY0r,dT0,dX0i @ F(1) + VHSUB dY0i,dT1,dX0r + .else + VADD dY0r,dT0,dX0i @ F(1) + VSUB dY0i,dT1,dX0r + .endif + + VREV64 qY1,qY1 + + VST2 {dY0r,dY0i},[pOut1],step + VST2 {dY1r,dY1i},[pOut1] + ADD pOut1,pOut1,#16 + SUB pOut1, pOut1, step + SUB step,step,#32 + + BGT evenOddButterflyLoop\name + + SUB pSrc,pSrc,#4 @ set both the ptrs to the last element + SUB pOut1,pOut1,#4 + B lastElement\name + +smallFFTSize\name: + @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]} + @ Note: W^(k) is stored as negated value and also need to + @ conjugate the values from the table. + + @ Z(0) : no need of twiddle multiply + @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] } + + VLD1 dX0S32[0],[pSrc],step + ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes + + VLD1 dX1S32[0],[pSrc]! + SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1 + + MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes + SUB step1,step1,#4 @ (N/4-1)*4 bytes + + VHADD dY0,dX0,dX1 @ [b+d | a+c] + VHSUB dY1,dX0,dX1 @ [b-d | a-c] + VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] + + .ifeqs "\scaled", "TRUE" + VHSUB dX0,dY0,dY1 + SUBS size,size,#2 + VHADD dX1,dY0,dY1 + .else + VSUB dX0,dY0,dY1 + SUBS size,size,#2 + VADD dX1,dY0,dY1 + .endif + + SUB pSrc,pSrc,step + VST1 dX0[0],[pOut1]! + ADD pTwiddleTmp,pTwiddle,#4 @ W^2 + VST1 dX1[1],[pOut1]! 
+ ADD argTwiddle1,pTwiddle,twStep @ W^1 + + BLT decrementScale\name + BEQ lastElement\name + + @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)] + @ Note: W^k is stored as negative values in the table and also need to + @ conjugate the values from the table. + @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) + @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1). + + SUB step,step,#12 + +evenOddButterflyLoopSize4\name: + VLD1 dW0rS32[0],[argTwiddle1],step1 + VLD1 dW1rS32[0],[argTwiddle1]! + + VLD2 {dX0r[0],dX0i[0]},[pSrc]! + VLD2 {dX0r[1],dX0i[1]},[pSrc],step + SUB pSrc,pSrc,#4 + SUB argTwiddle1,argTwiddle1,step1 + VLD2 {dX1r[0],dX1i[0]},[pSrc]! + VLD2 {dX1r[1],dX1i[1]},[pSrc]! + + SUB step1,step1,#4 @ (N/4-2)*4 bytes + VLD1 dW0iS32[0],[pTwiddleTmp],step1 + VLD1 dW1iS32[0],[pTwiddleTmp]! + SUB pSrc,pSrc,step + + SUB pTwiddleTmp,pTwiddleTmp,step1 + VREV32 dX1r,dX1r + VREV32 dX1i,dX1i + SUBS size,size,#4 + + VHSUB dT2,dX0r,dX1r @ a-c + VHADD dT3,dX0i,dX1i @ b+d + SUB step1,step1,#4 + VHADD dT0,dX0r,dX1r @ a+c + VHSUB dT1,dX0i,dX1i @ b-d + + VTRN dW1r,dW1i + VTRN dW0r,dW0i + + VMULL qT0,dW1r,dT2 + VMLSL qT0,dW1i,dT3 + VMULL qT1,dW1r,dT3 + VMLAL qT1,dW1i,dT2 + VMULL qT2,dW0r,dT2 + VMLAL qT2,dW0i,dT3 + VMULL qT3,dW0r,dT3 + VMLSL qT3,dW0i,dT2 + + VRSHRN dX1r,qT0,#15 + VRSHRN dX1i,qT1,#15 + + .ifeqs "\scaled", "TRUE" + VHADD dY1r,dT0,dX1i @ F(N/2 -1) + VHSUB dY1i,dX1r,dT1 + .else + VADD dY1r,dT0,dX1i @ F(N/2 -1) + VSUB dY1i,dX1r,dT1 + .endif + + VREV32 dY1r,dY1r + VREV32 dY1i,dY1i + + VRSHRN dX0r,qT2,#15 + VRSHRN dX0i,qT3,#15 + + .ifeqs "\scaled", "TRUE" + VHADD dY0r,dT0,dX0i @ F(1) + VHSUB dY0i,dT1,dX0r + .else + VADD dY0r,dT0,dX0i @ F(1) + VSUB dY0i,dT1,dX0r + .endif + + VST2 {dY0r[0],dY0i[0]},[pOut1]! + VST2 {dY0r[1],dY0i[1]},[pOut1],step + SUB pOut1, #4 + VST2 {dY1r[0],dY1i[0]},[pOut1]! + VST2 {dY1r[1],dY1i[1]},[pOut1]! 
+ SUB pOut1,pOut1,step + SUB pSrc,pSrc,#4 @ set both the ptrs to the last element + SUB pOut1,pOut1,#4 + + @ Last element can be expanded as follows + @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve) + @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)] + @ 1/2[2a+j0] - j (c-jd) [0+j2b] + @ (a+bc, -bd) + @ Since (c,d) = (0,1) for the last element, result is just (a,-b) + +lastElement\name: + VLD1 dX0rS32[0],[pSrc] + + .ifeqs "\scaled", "TRUE" + VSHR dX0r,dX0r,#1 + .endif + + VST1 dX0r[0],[pOut1]! + VNEG dX0r,dX0r + VST1 dX0r[1],[pOut1] + +decrementScale\name: + .ifeqs "\scaled", "TRUE" + SUB scale,scale,#1 + .endif + + .endm + + M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4 + FFTSTAGE "FALSE","TRUE",Inv + M_END + + M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4 + FFTSTAGE "TRUE","TRUE",InvSfs + M_END + + + .end diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S index 57fef7a9404..9959f8fdde8 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S @@ -30,8 +30,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S index 323eb8319da..88a08ff3fab 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S @@ -21,8 +21,8 @@ @// Include 
standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S index 02f3888c56f..85b85295076 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S @@ -21,8 +21,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S index 73c1f4b82f3..20c35e15651 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S @@ -21,8 +21,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S index ff62dd132b8..dbe170c62e0 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S @@ -21,8 +21,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" 
+#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S index 9d2e4ab8b44..af86b919a8b 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S @@ -20,8 +20,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S index ae450c5f629..8f63eb8510f 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S @@ -21,8 +21,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S index 4447e76b1f7..19a2f253dc0 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S @@ -20,8 +20,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" 
+#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S index a16c79f75eb..4bdbb52c914 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S index 9f7b531d300..94b3d49e848 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S index 666f4f349a7..2b34d997341 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import 
symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S index f9bbebcca91..17e0415e822 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S index cdb42a994a1..049621bfabc 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) @@ -142,7 +142,6 @@ RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16 - VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] MOV subFFTSize,#4 @// subFFTSize = 1 for the first stage @@ -158,6 +157,7 @@ grpZeroSetLoop\name: + VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] .ifeqs "\scaled", "TRUE" @@ -178,9 +178,6 @@ grpZeroSetLoop\name: VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1] VHADD qZ0,qY0,qY1 @// y0 - VLD2 {dXr3,dXi3},[pSrc :128],setStep - - .ifeqs "\inverse", "TRUE" VHSUB dZr3,dYr2,dYi3 @// y3 @@ -235,9 +232,6 @@ grpZeroSetLoop\name: VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1] VADD qZ0,qY0,qY1 @// y0 - VLD2 {dXr3,dXi3},[pSrc :128],setStep - - 
.ifeqs "\inverse", "TRUE" VSUB dZr3,dYr2,dYi3 @// y3 diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S index 23e2c373d62..4e46a010641 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @@ -163,7 +163,6 @@ @// Define stack arguments MOV pw2,pTwiddle - VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]! MOV pw3,pTwiddle MOV pw1,pTwiddle @@ -171,42 +170,47 @@ @// pOut0+outPointStep == increment of 4*outPointStep bytes MOV outPointStep,subFFTSize,LSL #2 - VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]! MOV subFFTNum,#1 @//after the last stage LSL grpCount,subFFTSize,#2 @// Update grpCount and grpSize rightaway - VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]! @// update subFFTSize for the next stage MOV subFFTSize,grpCount MOV dstStep,outPointStep,LSL #1 - VLD2 {dW1r,dW1i}, [pw1 :128]! - - ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16 - VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i - VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i - @// Process 4 groups at a time grpLoop\name: + VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i + VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! 
@// AC.r AC.i BD.r BD.i + @// Load the second twiddle for 4 groups : w^2 + @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3 + VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]! - @// Rearrange the third twiddle - VUZP dW3r,dW3i - SUBS grpCount,grpCount,#16 @// grpCount is multiplied by 4 + VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r + @// Load the third twiddle for 4 groups : w^3 + @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3 + VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]! - VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r + + VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]! + VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i + VLD2 {dW1r,dW1i}, [pw1 :128]! + + @// Rearrange the third twiddle + VUZP dW3r,dW3i + SUBS grpCount,grpCount,#16 @// grpCount is multiplied by 4 .ifeqs "\inverse", "TRUE" VMULL qT0,dXr1,dW1r @@ -225,8 +229,6 @@ grpLoop\name: @// Load the first twiddle for 4 groups : w^1 @// w^1 twiddle (i+0,i+1,i+2,i+3) for group 0,1,2,3 - VLD2 {dW1r,dW1i}, [pw1 :128]! - .ifeqs "\inverse", "TRUE" VMULL qT2,dXr2,dW2r VMLAL qT2,dXi2,dW2i @// real part @@ -260,24 +262,12 @@ grpLoop\name: .ENDIF - @// Load the second twiddle for 4 groups : w^2 - @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3 - VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]! - - VRSHRN dZr2,qT2,#15 VRSHRN dZi2,qT3,#15 - @// Load the third twiddle for 4 groups : w^3 - @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3 - - VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]! - VRSHRN dZr3,qT0,#15 VRSHRN dZi3,qT1,#15 - VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]! - .ifeqs "\scaled", "TRUE" @// finish first stage of 4 point FFT @@ -285,7 +275,6 @@ grpLoop\name: VHADD qY0,qX0,qZ2 VHSUB qY2,qX0,qZ2 VHADD qY1,qZ1,qZ3 - VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! 
@// AC.r AC.i BD.r BD.i VHSUB qY3,qZ1,qZ3 @@ -293,7 +282,6 @@ grpLoop\name: VHSUB qZ0,qY2,qY1 VHADD qZ2,qY2,qY1 - VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i .ifeqs "\inverse", "TRUE" @@ -329,7 +317,6 @@ grpLoop\name: VADD qY0,qX0,qZ2 VSUB qY2,qX0,qZ2 VADD qY1,qZ1,qZ3 - VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i VSUB qY3,qZ1,qZ3 @@ -337,7 +324,6 @@ grpLoop\name: VSUB qZ0,qY2,qY1 VADD qZ2,qY2,qY1 - VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i .ifeqs "\inverse", "TRUE" @@ -376,7 +362,6 @@ grpLoop\name: @// Reset and Swap pSrc and pDst for the next stage MOV pTmp,pDst - SUB pSrc,pSrc,#64 @// Extra increment currently done in the loop SUB pDst,pSrc,outPointStep,LSL #2 @// pDst -= size; pSrc -= 4*size bytes SUB pSrc,pTmp,outPointStep diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S index 0eba3856f2a..7bdbe41e08d 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @@ -154,7 +154,6 @@ MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep VLD1 dW2,[pTwiddle :64] @//[wi | wr] ADD setStep,srcStep,pointStep @// setStep = 3*pointStep - SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16 VLD1 dW3,[pTwiddle :64] @//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16 RSB setStep,setStep,#0 @// setStep = - 3*pointStep @@ -167,26 +166,23 @@ grpLoop\name: - VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] ADD 
stepTwiddle,stepTwiddle,pointStep - VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1] ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point - VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2] MOV twStep,stepTwiddle,LSL #2 - VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle MOV setCount,pointStep,LSR #2 - ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set - ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set + ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set @// Loop on the sets : 4 at a time setLoop\name: + VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1] + VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2] - SUBS setCount,setCount,#4 @// decrement the loop counter + SUBS setCount,setCount,#4 @// decrement the loop counter .ifeqs "\inverse", "TRUE" VMULL qT0,dXr1,dW1[0] @@ -202,8 +198,6 @@ setLoop\name: .ENDIF - VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1] - .ifeqs "\inverse", "TRUE" VMULL qT2,dXr2,dW2[0] VMLAL qT2,dXi2,dW2[1] @// real part @@ -218,11 +212,13 @@ setLoop\name: .ENDIF + VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set + VRSHRN dZr1,qT0,#15 VRSHRN dZi1,qT1,#15 - - VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2] + VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] + ADD pSrc,pSrc,#16 @// set pSrc to data[1] of the next set .ifeqs "\inverse", "TRUE" VMULL qT0,dXr3,dW3[0] @@ -244,7 +240,6 @@ setLoop\name: VRSHRN dZr3,qT0,#15 VRSHRN dZi3,qT1,#15 - VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set .ifeqs "\scaled", "TRUE" @@ -253,7 +248,6 @@ setLoop\name: VHADD qY0,qX0,qZ2 VHSUB qY2,qX0,qZ2 - VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0] VHADD qY1,qZ1,qZ3 VHSUB qY3,qZ1,qZ3 @@ -303,7 +297,6 @@ setLoop\name: VADD qY0,qX0,qZ2 VSUB qY2,qX0,qZ2 - VLD2 {dXr0,dXi0},[pSrc]! 
@// data[0] VADD qY1,qZ1,qZ3 VSUB qY3,qZ1,qZ3 @@ -351,7 +344,6 @@ setLoop\name: .ENDIF - ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set BGT setLoop\name VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr] diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S index 588c3197db9..f9ff37a275d 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @@ -233,12 +233,12 @@ VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4] VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5] VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6] - VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set - @// setStep = -7*pointStep + 16 @// grp = 0 a special case since all the twiddle factors are 1 @// Loop on the sets : 4 sets at a time grpZeroSetLoop\name: + VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set + @// setStep = -7*pointStep + 16 @// Decrement setcount SUBS setCount,setCount,#4 @// decrement the set loop counter @@ -348,9 +348,6 @@ grpZeroSetLoop\name: VSUB dVi7,dVi7,dT1 SUB pDst, pDst, step2 @// set pDst to y1 - VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] - - VHSUB dYr3,dVr3,dVr7 VHSUB dYi3,dVi3,dVi7 VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1 @@ -388,7 +385,6 @@ grpZeroSetLoop\name: VSUB dVr5,dT1,dVi5 @// a * V5 VADD dVi5,dT1,dVi5 - VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] VHSUB qY5,qV1,qV5 @@ -514,9 +510,6 @@ grpZeroSetLoop\name: VSUB dVi7,dVi7,dT1 SUB pDst, pDst, step2 @// set pDst to y1 - VLD2 
{dXr7,dXi7},[pSrc :128],setStep @// data[7] - - VSUB dYr3,dVr3,dVr7 VSUB dYi3,dVi3,dVi7 VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1 @@ -554,7 +547,6 @@ grpZeroSetLoop\name: VSUB dVr5,dT1,dVi5 @// a * V5 VADD dVi5,dT1,dVi5 - VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] VSUB qY5,qV1,qV5 diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S index 3bc5f02a743..de589c95fa5 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S index 30a8f56b487..eeb8c6eb289 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S @@ -30,8 +30,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S index a9700ec3eab..967d7b59750 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S @@ -29,8 +29,8 @@ 
@// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S index 685f85b6f6e..412b64fb59a 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S index 1b5478b2503..91e5299e071 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S @@ -28,8 +28,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S index 3c23983efee..22efea45b0b 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S @@ -30,8 +30,8 @@ @// Include standard headers -#include 
"dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S index a5fb0e27105..d4d4abb4c21 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S @@ -30,8 +30,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S index da0c10f1f66..aa761126a82 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S @@ -20,8 +20,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S index ca15c6b06cb..a3c21ac015d 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include 
"dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S index 90f969a83d5..504ef955d24 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S @@ -27,8 +27,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S index fda1ae4a16e..fda446cc896 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S @@ -20,8 +20,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S index 84d230036fc..402885fa8fb 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S @@ -28,8 +28,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git 
a/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S new file mode 100644 index 00000000000..e9530774cdf --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S @@ -0,0 +1,639 @@ +@ +@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ +@ Some code in this file was originally from file +@ omxSP_FFTFwd_RToCCS_S32_Sfs_s.S which was licensed as follows. +@ It has been relicensed with permission from the copyright holders. +@ + +@ +@ OpenMAX DL: v1.0.2 +@ Last Modified Revision: 7810 +@ Last Modified Date: Thu, 04 Oct 2007 +@ +@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +@ + +@ +@ Description: +@ Compute a forward FFT for a real signal, using 16 bit complex FFT routines. 
+@ + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe +.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe + +@Input Registers +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 +#define scale r3 + +@ Output registers +#define result r0 + +@Local Scratch Registers +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define pTwiddle r4 +#define tmpOrder r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +@ Total num of radix stages to complete the FFT +#define count r8 +#define x0r r4 +#define x0i r5 +#define diffMinusOne r2 +#define round r3 +#define subFFTSizeTmp r6 +#define step r3 +#define stepr r11 +#define step1 r10 +#define step1r r6 +#define step2 r8 +#define step2r r9 +#define twStep r8 +#define zero r9 +#define pTwiddleTmp r5 +#define t0 r10 + +@ Neon registers +#define dX0 d0.s16 +#define dX0S32 d0.s32 +#define dzero d1.s16 +#define dZero d2.s16 +#define dShift d3.s16 +#define qShift q1.s16 +#define dX0r d2.s16 +#define dX0i d3.s16 +#define dX1r
d4.s16 +#define dX1i d5.s16 +#define qX1 q2.s16 +#define dX0rS32 d2.s32 +#define dX0iS32 d3.s32 +#define dX1rS32 d4.s32 +#define dX1iS32 d5.s32 +#define dT0 d6.s16 +#define dT1 d7.s16 +#define dT2 d8.s16 +#define dT3 d9.s16 +#define qT0 q5.s32 +#define qT1 q6.s32 +#define qT0s q5.s16 +#define qT1s q6.s16 +#define dW0r d14.s16 +#define dW0i d15.s16 +#define dW1r d16.s16 +#define dW1i d17.s16 +#define dW0rS32 d14.s32 +#define dW0iS32 d15.s32 +#define dW1rS32 d16.s32 +#define dW1iS32 d17.s32 +#define dY0r d14.s16 +#define dY0i d15.s16 +#define dY0rS32 d14.s32 +#define dY0iS32 d15.s32 +#define dY1r d16.s16 +#define dY1i d17.s16 +#define qY1 q8.s16 +#define dY1rS32 d16.s32 +#define dY1iS32 d17.s32 +#define dY0rS64 d14.s32 +#define dY0iS64 d15.s32 +#define qT2 q9.s32 +#define qT3 q10.s32 +#define d18s16 d18.s16 +#define d19s16 d19.s16 +#define d20s16 d20.s16 +#define d21s16 d21.s16 +@ lastThreeelements +#define dX1 d3.s16 +#define dW0 d4.s16 +#define dW1 d5.s16 +#define dY0 d10.s16 +#define dY1 d11.s16 +#define dY2 d12.s16 +#define dY3 d13.s16 + + @ Allocate stack memory required by the function + M_ALLOC4 diffOnStack, 4 + + @ Write function header + M_START omxSP_FFTFwd_RToCCS_S16_Sfs,r11,d15 + + @ Structure offsets for the FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + @ Define stack arguments + + @ Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @ Read other structure parameters + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + @ Do a N/2 point complex FFT including the scaling + + MOV N,N,ASR #1 @ N/2 point complex FFT + + CLZ order,N @ N = 2^order + RSB order,order,#31 + MOV subFFTSize,#1 + + CMP order,#3 + BGT orderGreaterthan3 @ order > 3 + + CMP order,#1 + BGE orderGreaterthan0 @ order > 0 + M_STR scale, diffOnStack,LT @ order = 0 + LDR x0r,[pSrc] + STR x0r,[pOut] + MOV pSrc,pOut + MOV argDst,pDst + B FFTEnd 
+ +orderGreaterthan0: + @ set the buffers appropriately for various orders + CMP order,#2 + MOVEQ argDst,pDst + MOVNE argDst,pOut + MOVNE pOut,pDst @ Pass 1st stage destination in RN5 + MOV argTwiddle,pTwiddle + + SUBS diff,scale,order + M_STR diff,diffOnStack + MOVGT scale,order + @ Now scale <= order + + CMP order,#1 + BGT orderGreaterthan1 + @ order = 1: + SUBS scale,scale,#1 + BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe + BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe + B FFTEnd + +orderGreaterthan1: + CMP order,#2 + MOV argScale,scale + BGT orderGreaterthan2 + @ order = 2: + SUBS argScale,argScale,#1 + BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe + BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe + SUBS argScale,argScale,#1 + BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe + BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe + B FFTEnd + +orderGreaterthan2: @ order = 3 + SUBS argScale,argScale,#1 + BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe + BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe + SUBS argScale,argScale,#1 + BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe + BLLT armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe + SUBS argScale,argScale,#1 + BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe + BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe + B FFTEnd + + +orderGreaterthan3: + @ check scale = 0 or scale = order + SUBS diff, scale, order @ scale > order + MOVGT scale,order + BGE specialScaleCase @ scale = 0 or scale = order + CMP scale,#0 + BEQ specialScaleCase + B generalScaleCase + +specialScaleCase: @ scale = 0, or, scale = order && order > 3 + TST order, #2 @ Set input args to fft stages + MOVEQ argDst,pDst + MOVNE argDst,pOut + MOVNE pOut,pDst @ Pass the first stage destination in RN5 + MOV argTwiddle,pTwiddle + + CMP diff,#0 + M_STR diff, diffOnStack + BGE scaleEqualsOrder + + @ check for even or odd order. 
+ @ NOTE: The following combination of BL's would work fine even though + @ the first BL would corrupt the flags. This is because the end of the + @ "grpZeroSetLoop" loop inside + @ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ. + + TST order,#0x00000001 + BLEQ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe + BLNE armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe + + CMP subFFTNum,#4 + BLT FFTEnd + +unscaledRadix4Loop: + BEQ lastStageUnscaledRadix4 + BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe + CMP subFFTNum,#4 + B unscaledRadix4Loop + +lastStageUnscaledRadix4: + BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe + B FFTEnd + +scaleEqualsOrder: + @ check for even or odd order + @ NOTE: The following combination of BL's would work fine even though + @ the first BL would corrupt the flags. This is because the end of the + @ "grpZeroSetLoop" loop inside + @ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ. + + TST order,#0x00000001 + BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe + BLNE armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe + + CMP subFFTNum,#4 + BLT FFTEnd + +scaledRadix4Loop: + BEQ lastStageScaledRadix4 + BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe + CMP subFFTNum,#4 + B scaledRadix4Loop + +lastStageScaledRadix4: + BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe + B FFTEnd + +generalScaleCase: @ 0 < scale < order and order > 3 + @ Determine the correct destination buffer + SUB diff,order,scale + TST diff,#0x01 + ADDEQ count,scale,diff,LSR #1 @ count = scale + (order - scale)/2 + MOVNE count,order + TST count,#0x01 @ Is count even or odd ?
+ + MOVEQ argDst,pDst @ Set input args to fft stages + MOVNE argDst,pOut + MOVNE pOut,pDst @ Pass 1st stage destination in RN5 + MOV argTwiddle,pTwiddle + + CMP diff,#1 + M_STR diff, diffOnStack + BEQ scaleps @ scaling including a radix2_ps stage + + MOV argScale,scale @ Put scale in RN4 to save and restore + BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe + SUBS argScale,argScale,#1 + +scaledRadix2Loop: + BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe + SUBS argScale,argScale,#1 @ save, restore scale in scaled stages + BGT scaledRadix2Loop + B outScale + +scaleps: + SUB argScale,scale,#1 @ order>3 and diff=1 => scale >= 3 + BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe + SUBS argScale,argScale,#1 + +scaledRadix2psLoop: + BEQ scaledRadix2psStage + BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe + SUBS argScale,argScale,#1 @ save, restore scale in scaled stages + BGE scaledRadix2psLoop + +scaledRadix2psStage: + BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe + B generalLastStageUnscaledRadix2 + +outScale: + M_LDR diff, diffOnStack + @check for even or odd order + TST diff,#0x00000001 + BEQ generalUnscaledRadix4Loop + B unscaledRadix2Loop + +generalUnscaledRadix4Loop: + CMP subFFTNum,#4 + BEQ generalLastStageUnscaledRadix4 + BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe + B generalUnscaledRadix4Loop + +generalLastStageUnscaledRadix4: + BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe + B End + +unscaledRadix2Loop: + CMP subFFTNum,#4 + BEQ generalLastTwoStagesUnscaledRadix2 + BL armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe + B unscaledRadix2Loop + +generalLastTwoStagesUnscaledRadix2: + BL armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe +generalLastStageUnscaledRadix2: + BL armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe + B End + +FFTEnd: @ Does only the scaling + M_LDR diff, diffOnStack + CMP diff,#0 + BLE finalComplexToRealFixup + + RSB diff,diff,#0 @ for right shift by a variable + VDUP 
qShift,diff + + @ save subFFTSize and use subFFTSizeTmp in the following loop + MOV subFFTSizeTmp,subFFTSize @ subFFTSizeTmp same reg as subFFTNum + + @ Use parallel loads for bigger FFT size. + CMP subFFTSizeTmp, #8 + BLT scaleLessFFTData + +scaleFFTData: + VLD1 {qT0s, qT1s},[pSrc:256] @ pSrc contains pDst pointer + SUBS subFFTSizeTmp,subFFTSizeTmp,#8 + VSHL qT0s,qShift + VSHL qT1s,qShift + VST1 {qT0s, qT1s},[pSrc:256]! + BGT scaleFFTData + B afterScaling + +scaleLessFFTData: + VLD1 {dX0S32[0]},[pSrc] @ pSrc contains pDst pointer + SUBS subFFTSizeTmp,subFFTSizeTmp,#1 + VSHL dX0,dShift + VST1 {dX0S32[0]},[pSrc]! + BGT scaleLessFFTData + +afterScaling: + SUB pSrc,pSrc,subFFTSize,LSL #2 @ reset pSrc for final fixup + + @ change the logic so that output after scaling is in pOut and not in pDst + @ finally store from pOut to pDst + @ change branch "End" to branch "finalComplexToRealFixup" in the above + @ chk the code below for multiplication by j factor + +finalComplexToRealFixup: + @ F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)] + @ 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)] + @ 1/2[2a+j0] - j [0+j2b] + @ (a+b, 0) + + @ F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)] + @ 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)] + @ 1/2[2a+j0] + j [0+j2b] + @ (a-b, 0) + + CMP subFFTSize,#4 + BLE smallFFTSize + +@ subFFTSize > 4: + @ F(0) and F(N/2) + VLD2 {dX0r[0],dX0i[0]},[pSrc]! + MOV zero,#0 + VMOV dX0r[1],zero + MOV step,subFFTSize,LSL #2 @ step = N/2 * 4 bytes + VMOV dX0i[1],zero + SUB twStep,step,subFFTSize @ twStep = 3N/8 * 8 bytes + + VADD dY0r,dX0r,dX0i @ F(0) = ((Z0.r+Z0.i) , 0) + MOV step1,subFFTSize,LSL #1 @ step1 = N/2 * 2 bytes + VSUB dY0i,dX0r,dX0i @ F(N/2) = ((Z0.r-Z0.i) , 0) + SUBS subFFTSize,subFFTSize,#2 + + VST1 dY0rS32[0],[argDst], step + ADD pTwiddleTmp,argTwiddle,#4 @ W^2 + VST1 dY0iS32[0],[argDst]!
+ ADD argTwiddle,argTwiddle,twStep @ W^1 + + VDUP dzero,zero + SUB argDst,argDst,step + SUB step,step,#20 + RSB stepr, step, #16 + SUB step1,step1,#8 @ (N/4-1)*8 bytes + RSB step1r,step1,#8 + + SUB step2, step1, #4 + RSB step2r, step2, #8 + + @ F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] + @ Note: W^k is stored as negative values in the table. + @ Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) + @ since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1). + +evenOddButterflyLoop: + VLD2 {dX0r,dX0i},[pSrc],step + VLD2 {dX1r,dX1i},[pSrc],stepr + + VLD1 dW0r,[argTwiddle],step1 + SUB step1, step1, #16 + VREV64 qX1,qX1 + + VLD1 dW1r,[argTwiddle],step1r + ADD step1r, step1r, #16 + VSUB dT2,dX0r,dX1r @ a-c + + VLD1 dW0i,[pTwiddleTmp],step2 + SUB step2, step2, #16 + VADD dT3,dX0i,dX1i @ b+d + + VLD1 dW1i,[pTwiddleTmp],step2r + ADD step2r, step2r, #16 + + VTRN dW0r,dW0i + VZIP dW1r, dW1i + + SUBS subFFTSize,subFFTSize,#8 + + VHADD dT0,dX0r,dX1r @ (a+c)/2 + VZIP dW1iS32, dW1rS32 + VHSUB dT1,dX0i,dX1i @ (b-d)/2 + + VQDMULH dY0,dW1i,dT2 + VQDMULH dY1,dW1r,dT3 + VQDMULH dY2,dW1i,dT3 + VQDMULH dY3,dW1r,dT2 + + VQDMULH d18s16,dW0r,dT2 + VQDMULH d19s16,dW0i,dT3 + VQDMULH d20s16,dW0r,dT3 + VQDMULH d21s16,dW0i,dT2 + + VRHADD dX1r, dY0, dY1 + VHSUB dX1i, dY2, dY3 + VHSUB dX0r, d18s16, d19s16 + VADD dY1i,dT1,dX1r + VRHADD dX0i, d20s16, d21s16 + VSUB dY1r,dT0,dX1i @ F(N/2 -1) + VSUB dY0r,dT0,dX0i @ F(1) + VADD dY0i,dT1,dX0r + + VNEG dY1i,dY1i + VREV64 qY1, qY1 + + VST2 {dY0r,dY0i},[argDst],step + SUB step,step,#32 @ (N/2-4)*4 bytes + VST2 {dY1r,dY1i},[argDst],stepr + ADD stepr,stepr,#32 + + BGT evenOddButterflyLoop + + SUB pSrc,pSrc,#4 @ points to the last element. + SUB argDst,argDst,#4 @ points to the last element. + + b lastElement + +smallFFTSize: + + @ F(0) and F(N/2) + VLD2 {dX0r[0],dX0i[0]},[pSrc]! 
+ MOV zero,#0 + VMOV dX0r[1],zero + MOV step,subFFTSize,LSL #2 @ step = N/2 * 4 bytes + VMOV dX0i[1],zero + SUB twStep,step,subFFTSize @ twStep = 3N/8 * 8 bytes + + VADD dY0r,dX0r,dX0i @ F(0) = ((Z0.r+Z0.i) , 0) + MOV step1,subFFTSize,LSL #1 @ step1 = N/2 * 2 bytes + VSUB dY0i,dX0r,dX0i @ F(N/2) = ((Z0.r-Z0.i) , 0) + SUBS subFFTSize,subFFTSize,#2 + + + VST1 dY0rS32[0],[argDst], step + ADD pTwiddleTmp,argTwiddle,#4 @ W^2 + VST1 dY0iS32[0],[argDst]! + ADD argTwiddle,argTwiddle,twStep @ W^1 + + VDUP dzero,zero + SUB argDst,argDst,step + + BLT End + BEQ lastElement + + SUB step,step,#12 + SUB step1,step1,#4 @ (N/4-1)*8 bytes + + @ F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)] + +butterflyLoopSubFFTSize4: + VLD1 dW0rS32[0], [argTwiddle],step1 + VLD1 dW1rS32[0],[argTwiddle]! + + VLD2 {dX0r[0],dX0i[0]},[pSrc]! + VLD2 {dX0r[1],dX0i[1]},[pSrc],step + SUB pSrc,pSrc,#4 + SUB argTwiddle,argTwiddle,step1 + VLD2 {dX1r[0],dX1i[0]},[pSrc]! + VLD2 {dX1r[1],dX1i[1]},[pSrc]! + + SUB step1,step1,#4 @ (N/4-2)*4 bytes + VLD1 dW0iS32[0],[pTwiddleTmp],step1 + VLD1 dW1iS32[0],[pTwiddleTmp]! + SUB pSrc,pSrc,step + + SUB pTwiddleTmp,pTwiddleTmp,step1 + VREV32 dX1r,dX1r + VREV32 dX1i,dX1i + SUBS subFFTSize,subFFTSize,#4 + + VSUB dT2,dX0r,dX1r @ a-c + SUB step1,step1,#4 + VADD dT3,dX0i,dX1i @ b+d + VADD dT0,dX0r,dX1r @ a+c + VSUB dT1,dX0i,dX1i @ b-d + VHADD dT0,dT0,dzero + VHADD dT1,dT1,dzero + + VTRN dW1r,dW1i + VTRN dW0r,dW0i + + VMULL qT0,dW1r,dT2 + VMLAL qT0,dW1i,dT3 + VMULL qT1,dW1r,dT3 + VMLSL qT1,dW1i,dT2 + + VMULL qT2,dW0r,dT2 + VMLSL qT2,dW0i,dT3 + VMULL qT3,dW0r,dT3 + VMLAL qT3,dW0i,dT2 + + VRSHRN dX1r,qT0,#16 + VRSHRN dX1i,qT1,#16 + + VSUB dY1r,dT0,dX1i @ F(N/2 -1) + VADD dY1i,dT1,dX1r + VNEG dY1i,dY1i + + VREV32 dY1r,dY1r + VREV32 dY1i,dY1i + + VRSHRN dX0r,qT2,#16 + VRSHRN dX0i,qT3,#16 + + VSUB dY0r,dT0,dX0i @ F(1) + VADD dY0i,dT1,dX0r + + VST2 {dY0r[0],dY0i[0]},[argDst]! + VST2 {dY0r[1],dY0i[1]},[argDst],step + SUB argDst, #4 + VST2 {dY1r[0],dY1i[0]},[argDst]! 
+ VST2 {dY1r[1],dY1i[1]},[argDst]! + SUB argDst,argDst,step + SUB pSrc,pSrc,#4 @ points to the last element. + SUB argDst,argDst,#4 @ points to the last element. + +lastElement: + @ Last element can be expanded as follows + @ 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)] + @ 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)] + @ 1/2[2a+j0] + j (c+jd) [0+j2b] + @ (a-bc, -bd) + @ Since (c,d) = (0,1) for the last element, result is just (a,-b) + + VLD1 dX0rS32[0],[pSrc] + VST1 dX0r[0],[argDst]! + VNEG dX0r,dX0r + VST1 dX0r[1],[argDst]! + +End: + @ Set return value + MOV result, #OMX_Sts_NoErr + + @ Write function tail + M_END + + .END diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S index a742162e616..c1385c025ed 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S index 5deaf896c53..9c45b54cdc1 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S @@ -20,8 +20,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S 
b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S new file mode 100644 index 00000000000..311dba99e83 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S @@ -0,0 +1,301 @@ +@ +@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ +@ Some code in this file was originally from file +@ omxSP_FFTInv_CToC_SC16_Sfs_s.S which was licensed as follows. +@ It has been relicensed with permission from the copyright holders. +@ + +@ +@ File Name: omxSP_FFTInv_CToC_SC16_Sfs_s.s +@ OpenMAX DL: v1.0.2 +@ Last Modified Revision: 6729 +@ Last Modified Date: Tue, 17 Jul 2007 +@ +@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +@ + +@ +@ Description: +@ Compute an inverse FFT for a 16-bit real signal, with complex FFT routines. 
+@ + +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" + +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe +.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe + +@Input Registers +#define pSrc r0 +#define pDst r1 +#define pFFTSpec r2 +#define scale r3 + +@ Output registers +#define result r0 + +@Local Scratch Registers +#define argTwiddle r1 +#define argDst r2 +#define argScale r4 +#define pTwiddle r4 +#define tmpOrder r4 +#define pOut r5 +#define subFFTSize r7 +#define subFFTNum r6 +#define N r6 +#define order r14 +#define diff r9 +@ Total num of radix stages to comple the FFT +#define count r8 +#define x0r r4 +#define x0i r5 +#define diffMinusOne r2 +#define round r3 +#define pOut1 r2 +#define size r7 +#define step r8 +#define step1 r9 +#define twStep r10 +#define pTwiddleTmp r11 +#define argTwiddle1 r12 +#define zero r14 + +@ Neon registers +#define dX0 D0.S32 +#define dShift D1.S32 +#define qShift Q0.s16 +#define dX1 D1.S32 +#define dY0 D2.S32 +#define dY1 D3.S32 +#define dX0r D0.S32 +#define dX0i D1.S32 +#define dX1r D2.S32 +#define dX1i D3.S32 +#define dW0r D4.S32 +#define dW0i 
D5.S32 +#define dW1r D6.S32 +#define dW1i D7.S32 +#define dT0 D8.S32 +#define dT1 D9.S32 +#define dT2 D10.S32 +#define dT3 D11.S32 +#define qT0 Q6.S64 +#define qT1 Q7.S64 +#define qT0s Q6.S16 +#define qT1s Q7.S16 +#define qT2 Q8.S64 +#define qT3 Q9.S64 +#define dY0r D4.S32 +#define dY0i D5.S32 +#define dY1r D6.S32 +#define dY1i D7.S32 +#define dzero D20.S32 +#define dY2 D4.S32 +#define dY3 D5.S32 +#define dW0 D6.S32 +#define dW1 D7.S32 +#define dW0Tmp D10.S32 +#define dW1Neg D11.S32 + + + + @ Allocate stack memory required by the function + M_ALLOC4 diffOnStack, 4 + + @ Write function header + M_START omxSP_FFTInv_CCSToR_S16_Sfs,r11,d15 + +@ Structure offsets for the FFTSpec + .set ARMsFFTSpec_N, 0 + .set ARMsFFTSpec_pBitRev, 4 + .set ARMsFFTSpec_pTwiddle, 8 + .set ARMsFFTSpec_pBuf, 12 + + @ Define stack arguments + + @ Read the size from structure and take log + LDR N, [pFFTSpec, #ARMsFFTSpec_N] + + @ Read other structure parameters + LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle] + LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf] + + @ Call the preTwiddle Radix2 stage before doing the complex IFFT + + @ The following conditional BL combination would work since + @ evenOddButterflyLoop in the first call would set Z flag to zero + + CMP scale,#0 + BLEQ armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe + BLGT armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe + +complexIFFT: + + ASR N,N,#1 @ N/2 point complex IFFT + ADD pSrc,pOut,N,LSL #2 @ set pSrc as pOut1 + + CLZ order,N @ N = 2^order + RSB order,order,#31 + MOV subFFTSize,#1 + + ADD scale,scale,order @ FFTInverse has a final scaling factor by N + + CMP order,#3 + BGT orderGreaterthan3 @ order > 3 + + CMP order,#1 + BGE orderGreaterthan0 @ order > 0 + M_STR scale, diffOnStack,LT @ order = 0 + LDRLT x0r,[pSrc] + STRLT x0r,[pDst] + MOVLT pSrc,pDst + BLT FFTEnd + +orderGreaterthan0: + @ set the buffers appropriately for various orders + CMP order,#2 + MOVNE argDst,pDst + MOVEQ argDst,pOut + MOVEQ pOut,pDst @ Pass the first 
stage destination in RN5 + MOV argTwiddle,pTwiddle + @ Store the scale factor and scale at the end + SUB diff,scale,order + M_STR diff, diffOnStack + BGE orderGreaterthan1 + BLLT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @ order = 1 + B FFTEnd + + +orderGreaterthan1: + MOV tmpOrder,order @ tmpOrder = RN 4 + BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe + CMP tmpOrder,#2 + BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe + BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe + B FFTEnd + + + + +orderGreaterthan3: + @ check scale = 0 or scale = order + SUB diff, scale, order @ scale > order + + TST order, #2 @ Set input args to fft stages + MOVNE argDst,pDst + MOVEQ argDst,pOut + MOVEQ pOut,pDst @ Pass the first stage destination in RN5 + MOV argTwiddle,pTwiddle + + CMP diff,#0 + M_STR diff, diffOnStack + BGE scaleEqualsOrder + + @check for even or odd order + @ NOTE: The following combination of BL's would work fine eventhough the first + @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside + @ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ + + TST order,#0x00000001 + BLEQ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe + BLNE armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe + + CMP subFFTNum,#4 + BLT FFTEnd + +unscaledRadix4Loop: + BEQ lastStageUnscaledRadix4 + BL armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe + CMP subFFTNum,#4 + B unscaledRadix4Loop + +lastStageUnscaledRadix4: + BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe + B FFTEnd + +scaleEqualsOrder: + @check for even or odd order + @ NOTE: The following combination of BL's would work fine eventhough the first + @ BL would corrupt the flags. 
This is because the end of the "grpZeroSetLoop" loop inside + @ armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ + + TST order,#0x00000001 + BLEQ armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe + BLNE armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe + + CMP subFFTNum,#4 + BLT FFTEnd + +scaledRadix4Loop: + BEQ lastStageScaledRadix4 + BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe + CMP subFFTNum,#4 + B scaledRadix4Loop + +lastStageScaledRadix4: + BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe + +FFTEnd: @ Does only the scaling + + M_LDR diff, diffOnStack + CMP diff,#0 + BLE End + + RSB diff,diff,#0 @ to use VRSHL for right shift by a variable + VDUP qShift,diff + + @ Use parallel loads for bigger FFT size. + CMP subFFTSize, #8 + BLT scaleLessFFTData + +scaleFFTData: + VLD1 {qT0s, qT1s},[pSrc:256] @ pSrc contains pDst pointer + SUBS subFFTSize,subFFTSize,#8 + VSHL qT0s,qShift + VSHL qT1s,qShift + VST1 {qT0s, qT1s},[pSrc:256]! + BGT scaleFFTData + B End + +scaleLessFFTData: @ N = subFFTSize ; dataptr = pDst ; scale = diff + VLD1 {dX0[0]},[pSrc] @ pSrc contains pDst pointer + SUBS subFFTSize,subFFTSize,#1 + VRSHL dX0,dShift + VST1 {dX0[0]},[pSrc]! 
+ BGT scaleLessFFTData + +End: + @ Set return value + MOV result, #OMX_Sts_NoErr + + @ Write function tail + M_END + + + + + + + .END diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S index becc0327e7f..f2f2d025d22 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S index 003d666036d..10ce047dbff 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S index c2e86d2f7e8..73a6549f00c 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S @@ -20,8 +20,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git 
a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S index ff85e2b5af6..2388d0f5811 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S @@ -29,8 +29,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S index 09c461cc78f..7df624301c3 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S @@ -28,8 +28,8 @@ @// Include standard headers -#include "dl/api/armCOMM_s.h" -#include "dl/api/omxtypes_s.h" +#include "dl/api/arm/armCOMM_s.h" +#include "dl/api/arm/omxtypes_s.h" @// Import symbols required from other files @// (For example tables) diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c index 081f23739dd..6ac9de85a90 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c index 288c76ca614..1fc4fe2bd6f 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c @@ -25,7 +25,7 @@ * Compute the size of the specification structure required */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c index 0ca3b5664b4..176586407cb 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c @@ -25,7 +25,7 @@ * Compute the size of the specification structure required */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_F32.c index 19b16bbd959..046d069d06e 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_F32.c @@ -9,7 +9,7 @@ * */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16.c 
b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16.c new file mode 100644 index 00000000000..7ad27500dc0 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Some code in this file was originally from file omxSP_FFTGetBufSize_R_S32.c + * which was licensed as follows. + * It has been relicensed with permission from the copyright holders. + */ + +/* + * OpenMAX DL: v1.0.2 + * Last Modified Revision: + * Last Modified Date: + */ + +#include "dl/api/arm/armOMX.h" +#include "dl/api/omxtypes.h" +#include "dl/sp/api/armSP.h" +#include "dl/sp/api/omxSP.h" + +/** + * Function: omxSP_FFTGetBufSize_R_S16 + * + * Description: + * Computes the size of the specification structure required for the length + * 2^order real FFT and IFFT functions. + * + * Remarks: + * This function is used in conjunction with the 16-bit functions + * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>. + * + * Parameters: + * [in] order base-2 logarithm of the length; valid in the range + * [1,12]. + * [out] pSize pointer to the number of bytes required for the + * specification structure. + * + * Return Value: + * Standard omxError result. See enumeration for possible result codes. 
+ * + */ + +OMXResult omxSP_FFTGetBufSize_R_S16(OMX_INT order, OMX_INT *pSize) { + OMX_INT NBy2,N,twiddleSize; + + /* Order zero not allowed */ + if (order == 0) { + return OMX_Sts_BadArgErr; + } + + NBy2 = 1 << (order - 1); + N = NBy2 << 1; + twiddleSize = 5 * N / 8; /* 3 / 4 (N / 2) + N / 4 */ + + /* 2 pointers to store bitreversed array and twiddle factor array */ + *pSize = sizeof(ARMsFFTSpec_R_SC16) + /* Twiddle factors */ + + sizeof(OMX_SC16) * twiddleSize + /* Ping Pong buffer for doing the N/2 point complex FFT; */ + /* extra size 'N' as a temporary buf for FFTInv_CCSToR_S16_Sfs */ + + sizeof(OMX_S16) * (N << 1) + /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */ + + 62 ; + + + return OMX_Sts_NoErr; +} + +/***************************************************************************** + * END OF FILE + *****************************************************************************/ + diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c index 846536386d9..6ebdae10c86 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c @@ -25,7 +25,7 @@ * Computes the size of the specification structure required. */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S32.c index d57294700e8..d5758d0a7ee 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S32.c @@ -25,7 +25,7 @@ * Computes the size of the specification structure required. 
*/ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_FC32.c index cc53c5912f1..4a68b6f6b76 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_FC32.c @@ -11,7 +11,7 @@ * complex float instead of SC32. */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC16.c index f8248bbbf0b..0a23b8b7651 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC16.c @@ -25,7 +25,7 @@ * Initializes the specification structures required */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC32.c index 9ea103f3d68..0b4b5371d5e 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC32.c @@ -25,7 +25,7 @@ * Initializes the specification structures required */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_F32.c index 32d22230ed7..b5067833517 100644 --- 
a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_F32.c @@ -11,7 +11,7 @@ * instead of S32. */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16.c new file mode 100644 index 00000000000..e3fc2719e4d --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Some code in this file was originally from file omxSP_FFTInit_R_S16S32.c + * which was licensed as follows. + * It has been relicensed with permission from the copyright holders. + */ + +/* + * OpenMAX DL: v1.0.2 + * Last Modified Revision: + * Last Modified Date: + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + */ + +#include "dl/api/arm/armOMX.h" +#include "dl/api/omxtypes.h" +#include "dl/sp/api/armSP.h" +#include "dl/sp/api/omxSP.h" + +/** + * Function: omxSP_FFTInit_R_S16 + * + * Description: + * Initialize the real forward-FFT specification information struct. + * + * Remarks: + * This function is used to initialize the specification structures + * for functions <ippsFFTFwd_RToCCS_S16_Sfs> and + * <ippsFFTInv_CCSToR_S16_Sfs>. Memory for *pFFTSpec must be + * allocated prior to calling this function. The number of bytes + * required for *pFFTSpec can be determined using + * <FFTGetBufSize_R_S16>. 
+ * + * Parameters: + * [in] order base-2 logarithm of the desired block length; + * valid in the range [1,12]. + * [out] pFFTFwdSpec pointer to the initialized specification structure. + * + * Return Value: + * Standard omxError result. See enumeration for possible result codes. + * + */ + +OMXResult omxSP_FFTInit_R_S16(OMXFFTSpec_R_S16* pFFTSpec, OMX_INT order) { + OMX_INT i = 0, j = 0; + OMX_SC16 *pTwiddle = NULL, *pTwiddle1 = NULL, *pTwiddle2 = NULL; + OMX_SC16 *pTwiddle3 = NULL, *pTwiddle4 = NULL; + OMX_S16 *pBuf = NULL; + OMX_U16 *pBitRev = NULL; + OMX_U32 pTmp = 0; + OMX_INT Nby2 = 0, N = 0, M = 0, diff = 0, step = 0; + OMX_S16 x = 0, y = 0, xNeg = 0; + OMX_S32 xS32 = 0, yS32 = 0; + ARMsFFTSpec_R_SC16 *pFFTStruct = NULL; + + /* Order zero not allowed */ + if (order == 0) { + return OMX_Sts_BadArgErr; + } + + /* Do the initializations */ + pFFTStruct = (ARMsFFTSpec_R_SC16*) pFFTSpec; + Nby2 = 1 << (order - 1); + N = Nby2 << 1; + pBitRev = NULL ; /* optimized implementations don't use bitreversal */ + pTwiddle = (OMX_SC16*) (sizeof(ARMsFFTSpec_R_SC16) + (OMX_S8*)pFFTSpec); + + /* Align to 32 byte boundary */ + pTmp = ((OMX_U32)pTwiddle)&31; /* (OMX_U32)pTwiddle % 32 */ + if(pTmp != 0) { + pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32 - pTmp)); + } + + pBuf = (OMX_S16*) (sizeof(OMX_SC16) * (5 * N / 8) + (OMX_S8*)pTwiddle); + + /* Align to 32 byte boundary */ + pTmp = ((OMX_U32)pBuf)&31; /* (OMX_U32)pBuf % 32 */ + if(pTmp != 0) { + pBuf = (OMX_S16*)((OMX_S8*)pBuf + (32 - pTmp)); + } + + /* + * Filling Twiddle factors : exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2). + * N/2 point complex FFT is used to compute N point real FFT. + * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size + * (MaxSize/8 + 1). Rest of the values i.e., up to MaxSize are calculated + * using the symmetries of sin and cos. + * The max size of the twiddle table needed is 3/4(N/2) for a radix-4 stage. 
+ * + * W = (-2 * PI) / N + * N = 1 << order + * W = -PI >> (order - 1) + * + * Note we use S32 twiddle factor table and round the values to 16 bits. + */ + + M = Nby2 >> 3; + diff = 12 - (order - 1); + step = 1 << diff; /* Step into the twiddle table for the current order */ + + xS32 = armSP_FFT_S32TwiddleTable[0]; + yS32 = armSP_FFT_S32TwiddleTable[1]; + x = (xS32 + 0x8000) >> 16; + y = (yS32 + 0x8000) >> 16; + xNeg = 0x7FFF; + + if((order-1) >= 3) { + /* i = 0 case */ + pTwiddle[0].Re = x; + pTwiddle[0].Im = y; + pTwiddle[2 * M].Re = -y; + pTwiddle[2 * M].Im = xNeg; + pTwiddle[4 * M].Re = xNeg; + pTwiddle[4 * M].Im = y; + + for (i=1; i<=M; i++){ + OMX_S16 x_neg = 0, y_neg = 0; + j = i * step; + + xS32 = armSP_FFT_S32TwiddleTable[2 * j]; + yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1]; + x = (xS32 + 0x8000) >> 16; + y = (yS32 + 0x8000) >> 16; + /* |x_neg = -x| doesn't work when x is 0x8000. */ + x_neg = (-(xS32 + 0x8000)) >> 16; + y_neg = (-(yS32 + 0x8000)) >> 16; + + pTwiddle[i].Re = x; + pTwiddle[i].Im = y; + pTwiddle[2 * M - i].Re = y_neg; + pTwiddle[2 * M - i].Im = x_neg; + pTwiddle[2 * M + i].Re = y; + pTwiddle[2 * M + i].Im = x_neg; + pTwiddle[4 * M - i].Re = x_neg; + pTwiddle[4 * M - i].Im = y; + pTwiddle[4 * M + i].Re = x_neg; + pTwiddle[4 * M + i].Im = y_neg; + pTwiddle[6 * M - i].Re = y; + pTwiddle[6 * M - i].Im = x; + } + } + else { + if ((order - 1) == 2) { + pTwiddle[0].Re = x; + pTwiddle[0].Im = y; + pTwiddle[1].Re = -y; + pTwiddle[1].Im = xNeg; + pTwiddle[2].Re = xNeg; + pTwiddle[2].Im = y; + } + if ((order-1) == 1) { + pTwiddle[0].Re = x; + pTwiddle[0].Im = y; + } + } + + /* + * Now fill the last N/4 values : exp^(-j*2*PI*k/N); k=1,3,5,...,N/2-1. + * These are used for the final twiddle fix-up for converting complex to + * real FFT. 
+ */ + + M = N >> 3; + diff = 12 - order; + step = 1 << diff; + + pTwiddle1 = pTwiddle + 3 * N / 8; + pTwiddle4 = pTwiddle1 + (N / 4 - 1); + pTwiddle3 = pTwiddle1 + N / 8; + pTwiddle2 = pTwiddle1 + (N / 8 - 1); + + xS32 = armSP_FFT_S32TwiddleTable[0]; + yS32 = armSP_FFT_S32TwiddleTable[1]; + x = (xS32 + 0x8000) >> 16; + y = (yS32 + 0x8000) >> 16; + xNeg = 0x7FFF; + + if((order) >= 3) { + for (i = 1; i <= M; i += 2 ) { + OMX_S16 x_neg = 0, y_neg = 0; + + j = i*step; + + xS32 = armSP_FFT_S32TwiddleTable[2 * j]; + yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1]; + x = (xS32 + 0x8000) >> 16; + y = (yS32 + 0x8000) >> 16; + /* |x_neg = -x| doesn't work when x is 0x8000. */ + x_neg = (-(xS32 + 0x8000)) >> 16; + y_neg = (-(yS32 + 0x8000)) >> 16; + + pTwiddle1[0].Re = x; + pTwiddle1[0].Im = y; + pTwiddle1 += 1; + pTwiddle2[0].Re = y_neg; + pTwiddle2[0].Im = x_neg; + pTwiddle2 -= 1; + pTwiddle3[0].Re = y; + pTwiddle3[0].Im = x_neg; + pTwiddle3 += 1; + pTwiddle4[0].Re = x_neg; + pTwiddle4[0].Im = y; + pTwiddle4 -= 1; + } + } + else { + if (order == 2) { + pTwiddle1[0].Re = -y; + pTwiddle1[0].Im = xNeg; + } + } + + /* Update the structure */ + pFFTStruct->N = N; + pFFTStruct->pTwiddle = pTwiddle; + pFFTStruct->pBitRev = pBitRev; + pFFTStruct->pBuf = pBuf; + + return OMX_Sts_NoErr; +} +/***************************************************************************** + * END OF FILE + *****************************************************************************/ + diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16S32.c index d157b3457c4..9a66430c2df 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16S32.c @@ -25,7 +25,7 @@ * Initialize the real forward-FFT specification information struct. 
*/ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S32.c index 337f2a20b28..d55ab065095 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c +++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S32.c @@ -25,7 +25,7 @@ * Initialize the real forward-FFT specification information struct. */ -#include "dl/api/armOMX.h" +#include "dl/api/arm/armOMX.h" #include "dl/api/omxtypes.h" #include "dl/sp/api/armSP.h" #include "dl/sp/api/omxSP.h" diff --git a/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp b/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp index 99b3774324f..99280b59c2d 100644 --- a/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp +++ b/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp @@ -17,7 +17,7 @@ ], 'dependencies' : [ '../../../dl.gyp:openmax_dl', - 'test_utilities' + 'test_utilities', ], 'conditions': [ ['big_float_fft == 1', { @@ -27,7 +27,110 @@ }], ], }, + 'conditions': [ + ['target_arch == "arm"', { + # Test programs supported on ARM + 'targets': [ + { + # Test complex fixed-point 16-bit FFT + 'target_name': 'test_fft16', + 'type': 'executable', + 'sources': [ + 'test_fft16.c', + ], + }, + { + # Test complex fixed-point 32-bit FFT + 'target_name': 'test_fft32', + 'type': 'executable', + 'sources': [ + 'test_fft32.c', + ], + }, + { + # Test real 32-bit fixed-point FFT + 'target_name': 'test_rfft32', + 'type': 'executable', + 'sources': [ + 'test_rfft32.c', + ], + }, + { + # Test real 16-bit fixed-point FFT implemented with S32 routines. + 'target_name': 'test_rfft16_s32', + 'type': 'executable', + 'sources': [ + 'test_rfft16_s32.c', + ], + }, + { + # Test real 16-bit fixed-point FFT implemented with S16 routines. 
+ 'target_name': 'test_rfft16_s16', + 'type': 'executable', + 'sources': [ + 'test_rfft16_s16.c', + ], + }, + { + # Test complex floating-point FFT + 'target_name': 'test_float_fft', + 'type': 'executable', + 'sources': [ + 'test_float_fft.c', + 'support/float_fft_neon.c', + ], + }, + # Non-NEON test programs + { + # Test complex floating-point FFT, non-NEON + 'target_name': 'test_float_fft_armv7', + 'type': 'executable', + 'defines': [ + 'ARM_VFP_TEST' + ], + 'sources': [ + 'test_float_fft.c', + 'support/float_fft_armv7.c', + ], + }, + { + # Test real floating-point FFT, non-NEON + 'target_name': 'test_float_rfft_armv7', + 'type': 'executable', + 'sources': [ + 'test_float_rfft.c', + 'support/float_rfft_armv7.c', + 'support/float_rfft_thresholds.h', + ], + }, + { + # Test real floating-point FFT, detecting NEON support + 'target_name': 'test_float_rfft_detect', + 'type': 'executable', + 'sources': [ + 'test_float_rfft.c', + 'support/float_rfft_detect.c', + 'support/float_rfft_thresholds.h', + ], + }, + { + # Simple timing test of FFTs, non-NEON + 'target_name': 'test_fft_time_armv7', + 'type': 'executable', + 'defines': [ + # Timing test for non-NEON is only supported for float FFTs. 
+ 'ARM_VFP_TEST', + 'FLOAT_ONLY', + ], + 'sources': [ + 'test_fft_time.c', + ], + }, + ], + }], + ], 'targets': [ + # Targets that should be supported by all architectures { # Test utilities 'target_name': 'test_utilities', @@ -43,51 +146,24 @@ ], }, { - # Test complex fixed-point 16-bit FFT - 'target_name': 'test_fft16', - 'type': 'executable', - 'sources': [ - 'test_fft16.c', - ], - }, - { - # Test complex fixed-point 32-bit FFT - 'target_name': 'test_fft32', - 'type': 'executable', - 'sources': [ - 'test_fft32.c', - ], - }, - { - # Test real 32-bit fixed-point FFT - 'target_name': 'test_rfft32', - 'type': 'executable', - 'sources': [ - 'test_rfft32.c', - ], - }, - { - # Test real 16-bit fixed-point FFT - 'target_name': 'test_rfft16', - 'type': 'executable', - 'sources': [ - 'test_rfft16.c', - ], - }, - { - # Test complex floating-point FFT - 'target_name': 'test_float_fft', - 'type': 'executable', - 'sources': [ - 'test_float_fft.c', - ], - }, - { # Test real floating-point FFT 'target_name': 'test_float_rfft', 'type': 'executable', 'sources': [ 'test_float_rfft.c', + 'support/float_rfft_thresholds.h', + ], + 'conditions': [ + ['target_arch == "arm"', { + 'sources': [ + 'support/float_rfft_neon.c', + ], + }], + ['target_arch == "ia32"', { + 'sources': [ + 'support/float_rfft_x86.c', + ], + }], ], }, { @@ -97,18 +173,42 @@ 'sources': [ 'test_fft_time.c', ], + 'conditions': [ + ['target_arch == "ia32"', { + 'defines': [ + # Timing test only for float FFTs on x86 + 'FLOAT_ONLY', + ], + }], + ], }, { # Build all test programs. 
'target_name': 'All', 'type': 'none', - 'dependencies': [ - 'test_fft16', - 'test_fft32', - 'test_float_fft', + 'conditions' : [ + ['target_arch == "arm"', { + # Supported test programs for ARM + 'dependencies': [ + 'test_fft16', + 'test_fft32', + 'test_float_fft', + 'test_float_rfft', + 'test_rfft16_s32', + 'test_rfft16_s16', + 'test_rfft32', + # Non-Neon tests + 'test_fft_time_armv7', + 'test_float_fft_armv7', + 'test_float_rfft_armv7', + # Tests with detection + 'test_float_rfft_detect', + ], + }], + ], + 'dependencies' : [ + # All architectures must support at least the float rfft test 'test_float_rfft', - 'test_rfft16', - 'test_rfft32', 'test_fft_time', ], }, diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c new file mode 100644 index 00000000000..b6d1c98279d --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/omxSP.h" +#include "dl/sp/api/x86SP.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" +#include <stdbool.h> + +extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace( + const OMX_F32 *src, + OMX_F32 *buf1, + OMX_F32 *buf2, + const OMX_F32 *twiddle, + OMX_INT n, + bool forward_fft); + +extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse( + const OMX_F32 *src, + OMX_F32 *buf1, + OMX_F32 *buf2, + const OMX_F32 *twiddle, + OMX_INT n, + bool forward_fft); + +/** + * A two-for-one algorithm is used here to do the real fft: + * + * Input x[n], (n = 0, ..., N - 1) + * Output X[k] = DFT(N, k){x} + * a[n] = x[2n], (n = 0, ..., N/2 - 1) + * b[n] = x[2n + 1], (n = 0, ..., N/2 - 1) + * z[n] = a[n] + j * b[n] + * Z[k] = DFT(N/2, k){z} + * Z' is the complex conjugate of Z + * A[k] = (Z[k] + Z'[N/2 - k]) / 2 + * B[k] = -j * (Z[k] - Z'[N/2 - k]) / 2 + * X[k] = A[k] + B[k] * W[k], (W = exp(-j*2*PI*k/N); k = 0, ..., N/2 - 1) + * X[k] = A[k] - B[k], (k = N/2) + * X' is complex conjugate of X + * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1) + */ + +/** + * This function is the last permutation of two-for-one FFT algorithm. 
+ * We move the division by 2 to the last step in the implementation, so: + * A[k] = (Z[k] + Z'[N/2 - k]) + * B[k] = -j * (Z[k] - Z'[N/2 - k]) + * X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1) + * X[k] = (A[k] - B[k]), (k = N/2) + * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1) + */ +static void RevbinPermuteFwd( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_INT j; + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + + OMX_FC32 big_a; + OMX_FC32 big_b; + OMX_FC32 temp; + const OMX_F32 *tw; + + for (i = 1, j = n_by_2 - 1; i < n_by_4; i++, j--) { + // A[k] = (Z[k] + Z'[N/2 - k]) + big_a.Re = in[i] + in[j]; + big_a.Im = in[j + n_by_2] - in[i + n_by_2]; + + // B[k] = -j * (Z[k] - Z'[N/2 - k]) + big_b.Re = in[j] - in[i]; + big_b.Im = in[j + n_by_2] + in[i + n_by_2]; + + // W[k] + tw = twiddle + i; + + // temp = B[k] * W[k] + temp.Re = big_b.Re * tw[0] + big_b.Im * tw[n]; + temp.Im = big_b.Re * tw[n] - big_b.Im * tw[0]; + + // Convert split format to interleaved format. + // X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1) + out[i << 1] = 0.5f * (big_a.Re - temp.Im); + out[(i << 1) + 1] = 0.5f * (temp.Re - big_a.Im); + // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1) + out[j << 1] = 0.5f * (big_a.Re + temp.Im); + out[(j << 1) + 1] = 0.5f * (temp.Re + big_a.Im); + } + + // X[k] = A[k] - B[k] (k = N/2) + out[n_by_2] = in[n_by_4]; + out[n_by_2 + 1] = -in[n_by_4 + n_by_2]; + + out[0] = in[0] + in[n_by_2]; + out[1] = 0; + out[n] = in[0] - in[n_by_2]; + out[n + 1] = 0; +} + +// Sse version of RevbinPermuteFwd function. 
+static void RevbinPermuteFwdSse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_INT j; + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + + VC v_i; + VC v_j; + VC v_big_a; + VC v_big_b; + VC v_temp; + VC v_x0; + VC v_x1; + VC v_tw; + + __m128 factor = _mm_set1_ps(0.5f); + + for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { + VC_LOAD_SPLIT(&v_i, (in + i), n_by_2); + + VC_LOADU_SPLIT(&v_j, (in + j), n_by_2); + VC_REVERSE(&v_j); + + // A[k] = (Z[k] + Z'[N/2 - k]) + VC_ADD_SUB(&v_big_a, &v_j, &v_i); + + // B[k] = -j * (Z[k] - Z'[N/2 - k]) + VC_SUB_ADD(&v_big_b, &v_j, &v_i); + + // W[k] + VC_LOAD_SPLIT(&v_tw, (twiddle + i), n); + + // temp = B[k] * W[k] + VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw); + + VC_SUB_X(&v_x0, &v_big_a, &v_temp); + VC_ADD_X(&v_x1, &v_big_a, &v_temp); + + VC_MUL_F(&v_x0, &v_x0, factor); + VC_MUL_F(&v_x1, &v_x1, factor); + + // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1) + VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0); + + // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1) + VC_REVERSE(&v_x1); + VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1); + } + + out[n_by_2] = in[n_by_4]; + out[n_by_2 + 1] = -in[n_by_4 + n_by_2]; + + out[0] = in[0] + in[n_by_2]; + out[1] = 0; + out[n] = in[0] - in[n_by_2]; + out[n + 1] = 0; +} + +OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst, + const OMXFFTSpec_R_F32 *pFFTSpec) { + // Input must be 32 byte aligned + if (!pSrc || !pDst || (OMX_INT)pSrc & 31 || (OMX_INT)pDst & 31) + return OMX_Sts_BadArgErr; + + OMX_INT n; + OMX_INT n_by_2; + OMX_INT n_by_4; + const OMX_F32 *twiddle; + OMX_F32 *buf; + + const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec; + + n = pFFTStruct->N; + + // This is to handle the case of order == 1. 
+ if (n == 2) { + pDst[0] = (pSrc[0] + pSrc[1]); + pDst[1] = 0.0f; + pDst[2] = (pSrc[0] - pSrc[1]); + pDst[3] = 0.0f; + return OMX_Sts_NoErr; + } + + n_by_2 = n >> 1; + n_by_4 = n >> 2; + buf = pFFTStruct->pBuf1; + twiddle = pFFTStruct->pTwiddle; + + if(n_by_2 >= 16) { + buf = x86SP_F32_radix4_kernel_OutOfPlace_sse( + pSrc, + pFFTStruct->pBuf2, + buf, + twiddle, + n_by_2, + 1); + } else { + buf = x86SP_F32_radix2_kernel_OutOfPlace( + pSrc, + pFFTStruct->pBuf2, + buf, + twiddle, + n_by_2, + 1); + } + + if(n >= 8) + RevbinPermuteFwdSse(buf, pDst, twiddle, n); + else + RevbinPermuteFwd(buf, pDst, twiddle, n); + + return OMX_Sts_NoErr; +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c new file mode 100644 index 00000000000..f686a7f2f58 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/x86SP.h" +#include "dl/sp/api/omxSP.h" + +/** + * Function: omxSP_FFTGetBufSize_R_F32 + * + * Description: + * Computes the size of the specification structure required for the length + * 2^order real FFT and IFFT functions. + * + * Remarks: + * This function is used in conjunction with the 32-bit functions + * <FFTFwd_RToCCS_F32_Sfs> and <FFTInv_CCSToR_F32_Sfs>. + * + * Parameters: + * [in] order base-2 logarithm of the length; valid in the range + * [1,12]. ([1,15] if BIG_FFT_TABLE is defined.) 
+ * [out] pSize pointer to the number of bytes required for the + * specification structure. + * + * Return Value: + * Standard omxError result. See enumeration for possible result codes. + * + */ + +OMXResult omxSP_FFTGetBufSize_R_F32(OMX_INT order, OMX_INT *pSize) { + if (!pSize || (order < 1) || (order > TWIDDLE_TABLE_ORDER)) + return OMX_Sts_BadArgErr; + + OMX_INT n_by_2; + OMX_INT n; + + n_by_2 = 1 << (order - 1); + n = n_by_2 << 1; + + *pSize = sizeof(X86FFTSpec_R_FC32) + + // Twiddle factors. + sizeof(OMX_F32) * (n << 1) + + // Ping Pong buffer for doing the n/2 point complex FFT. + // pBuf1 + sizeof(OMX_F32) * n + 4 + + // pBuf2 + sizeof(OMX_F32) * n + 4 + + // Extra bytes to get 32 byte alignment of ptwiddle, pBuf1 + 62; + + return OMX_Sts_NoErr; +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInit_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInit_R_F32.c new file mode 100644 index 00000000000..564f1666274 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInit_R_F32.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * This is a modification of omxSP_FFTInit_R_S32.c to support float + * instead of S32. + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/omxSP.h" +#include "dl/sp/api/x86SP.h" + +/** + * Function: omxSP_FFTInit_R_F32 + * + * Description: + * Initialize the real forward-FFT specification information struct. + * + * Remarks: + * This function is used to initialize the specification structures + * for functions |omxSP_FFTFwd_RToCCS_F32_Sfs| and + * |omxSP_FFTInv_CCSToR_F32_Sfs|. 
Memory for *pFFTSpec must be + * allocated prior to calling this function. The number of bytes + * required for *pFFTSpec can be determined using + * |omxSP_FFTGetBufSize_R_F32|. + * + * Parameters: + * [in] order base-2 logarithm of the desired block length; + * valid in the range [1,12]. ([1,15] if + * BIG_FFT_TABLE is defined.) + * [out] pFFTFwdSpec pointer to the initialized specification structure. + * + * Return Value: + * Standard omxError result. See enumeration for possible result codes. + * + */ + +OMXResult omxSP_FFTInit_R_F32(OMXFFTSpec_R_F32 *pFFTSpec, OMX_INT order) +{ + OMX_F32 *pTwiddle; + OMX_F32 *pBuf; + OMX_INT i; + OMX_INT j; + OMX_INT N; + OMX_INT NBy2; + OMX_INT NBy4; + OMX_INT diff; + OMX_U32 pTmp; + X86FFTSpec_R_FC32 *pFFTStruct = (X86FFTSpec_R_FC32 *) pFFTSpec; + OMX_F32 real; + OMX_F32 imag; + + if (!pFFTSpec || (order < 1) || (order > TWIDDLE_TABLE_ORDER)) + return OMX_Sts_BadArgErr; + + N = 1 << order; + NBy2 = N >> 1; + + pTwiddle = (OMX_F32*) (sizeof(X86FFTSpec_R_FC32) + (OMX_S8*) pFFTSpec); + + // Align to 32 byte boundary. + pTmp = ((OMX_U32)pTwiddle) & 31; + if (pTmp) + pTwiddle = (OMX_F32*) ((OMX_S8*)pTwiddle + (32 - pTmp)); + + pBuf = (OMX_F32*) (sizeof(OMX_F32) * (N << 1) + (OMX_S8*) pTwiddle); + + // Align to 32 byte boundary. + pTmp = ((OMX_U32)pBuf) & 31; + if (pTmp) + pBuf = (OMX_F32*) ((OMX_S8*)pBuf + (32 - pTmp)); + + // Calculating Twiddle Factors. + diff = 1 << (TWIDDLE_TABLE_ORDER - order + 1); + + // For SSE optimization, using twiddle with split format by which the real and + // imag data are stored into first and last halves of the buffer separately + // The negatives are moved when generating pTwiddle table. 
+ if (order > 1) { + NBy4 = N >> 2; + for (i = 0, j = 0; i <= NBy4 >> 1; ++i, j += diff) { + real = armSP_FFT_F32TwiddleTable[j]; + imag = armSP_FFT_F32TwiddleTable[j + 1]; + + pTwiddle[i] = -real; + pTwiddle[i + N] = -imag; + + pTwiddle[NBy4 - i] = imag; + pTwiddle[NBy4 - i + N] = real; + + pTwiddle[NBy4 + i] = -imag; + pTwiddle[NBy4 + i + N] = real; + + pTwiddle[NBy2 - i] = real; + pTwiddle[NBy2 - i + N] = -imag; + + pTwiddle[NBy2 + i] = real; + pTwiddle[NBy2 + i + N] = imag; + + pTwiddle[NBy4 * 3 - i] = -imag; + pTwiddle[NBy4 * 3 - i + N] = -real; + + pTwiddle[NBy4 * 3 + i] = imag; + pTwiddle[NBy4 * 3 + i + N] = -real; + + pTwiddle[N - i - 1] = -real; + pTwiddle[(N << 1) - i - 1] = imag; + } + } else { + pTwiddle[0] = armSP_FFT_F32TwiddleTable[0]; + pTwiddle[2] = armSP_FFT_F32TwiddleTable[1]; + pTwiddle[1] = -pTwiddle[0]; + pTwiddle[3] = pTwiddle[2]; + } + pFFTStruct->N = N; + pFFTStruct->pTwiddle = pTwiddle; + pFFTStruct->pBuf1 = pBuf; + pFFTStruct->pBuf2 = pBuf + N + 4; + + return OMX_Sts_NoErr; +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c new file mode 100644 index 00000000000..1733d665288 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/api/omxSP.h" +#include "dl/sp/api/x86SP.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" +#include <stdbool.h> + +extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace( + const OMX_F32 *src, + OMX_F32 *buf1, + OMX_F32 *buf2, + const OMX_F32 *twiddle, + OMX_INT n, + bool forward_fft); + +extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse( + const OMX_F32 *src, + OMX_F32 *buf1, + OMX_F32 *buf2, + const OMX_F32 *twiddle, + OMX_INT n, + bool forward_fft); + +/** + * A two-for-one algorithm is used here to do the real ifft: + * + * Input X[k], (k = 0, ..., N - 1) + * Output x[n] = IDFT(N, k){X} + * X' is complex conjugate of X + * A[k] = (X[k] + X'[N/2 - k]) / 2 + * B[k] = (X[k] - X'[N/2 - k]) / 2 * W[k], (W = exp(j*2*PI*k/N); + * k = 0, ..., N/2 - 1) + * Z[k] = A[k] + j * B[k], (k = 0, ..., N/2 - 1) + * z[n] = IDFT(N/2, k){Z} + * x[2n] = Re(z[n]), (n = 0, ..., N/2 - 1) + * x[2n + 1] = Im(z[n]), (n = 0, ..., N/2 - 1) + */ + +/** + * This function is the first permutation of two-for-one IFFT algorithm. 
+ * We move the division by 2 to the last step in the implementation, so: + * A[k] = (X[k] + X'[N/2 - k]) + * B[k] = (X[k] - X'[N/2 - k]) * W[k], (k = 0, ..., N/2 - 1) + * Z[k] = (A[k] + j * B[k]) / 2, (k = 0, ..., N/2 - 1) + */ +static void RevbinPermuteInv(const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_INT j; + OMX_INT i_by_2; + OMX_INT j_by_2; + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + + OMX_FC32 big_a; + OMX_FC32 big_b; + OMX_FC32 temp; + const OMX_F32 *tw; + + for (i = 2, j = n - 2; i < n_by_2; i += 2, j -= 2) { + // A[k] = (X[k] + X'[N/2 - k]) + big_a.Re = in[i] + in[j]; + big_a.Im = in[i + 1] - in[j + 1]; + + // temp = (X[k] - X'[N/2 - k]) + temp.Re = in[i] - in[j]; + temp.Im = in[i + 1] + in[j + 1]; + + i_by_2 = i >> 1; + j_by_2 = j >> 1; + + // W[k] + tw = twiddle + i_by_2; + + // B[k] = (X[k] - X'[N/2 - k]) * W[k] + big_b.Re = temp.Re * tw[0] + temp.Im * tw[n]; + big_b.Im = temp.Re * tw[n] - temp.Im * tw[0]; + + // Convert split format to interleaved format. + // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1) + // The scaling of 1/2 will be merged into to the scaling in + // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs. + out[i_by_2] = big_a.Re + big_b.Im; + out[i_by_2 + n_by_2] = big_b.Re + big_a.Im; + out[j_by_2] = big_a.Re - big_b.Im; + out[j_by_2 + n_by_2] = big_b.Re - big_a.Im; + } + + // The n_by_2 complex point + out[n_by_4] = 2.0f * in[n_by_2]; + out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1]; + + // The first complex point + out[0] = in[0] + in[n]; + out[n_by_2] = in[0] - in[n]; +} + +// Sse version of RevbinPermuteInv function. 
+static void RevbinPermuteInvSse(const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_INT j; + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + const OMX_F32 *tw; + const OMX_F32 *pi; + const OMX_F32 *pj; + + VC v_i; + VC v_j; + VC v_big_a; + VC v_big_b; + VC v_temp; + VC v_tw; + + for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { + pi = in + (i << 1); + pj = in + (j << 1); + VC_LOAD_INTERLEAVE(&v_i, pi); + + v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]); + v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]); + + // A[k] = (X[k] + X'[N/2 - k]) + VC_ADD_SUB(&v_big_a, &v_i, &v_j); + + // temp = (X[k] - X'[N/2 - k]) + VC_SUB_ADD(&v_temp, &v_i, &v_j); + + // W[k] + tw = twiddle + i; + VC_LOAD_SPLIT(&v_tw, tw, n); + + // B[k] = (X[k] - X'[N/2 - k]) * W[k] + VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw); + + // Convert split format to interleaved format. + // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1) + // The scaling of 1/2 will be merged into to the scaling in + // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs. + VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2); + + VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2); + } + + // The n_by_2 complex point + out[n_by_4] = 2.0f * in[n_by_2]; + out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1]; + + // The first complex point + out[0] = in[0] + in[n]; + out[n_by_2] = in[0] - in[n]; +} + +OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst, + const OMXFFTSpec_R_F32 *pFFTSpec) { + // Input must be 32 byte aligned + if (!pSrc || !pDst || (OMX_INT)pSrc & 31 || (OMX_INT)pDst & 31) + return OMX_Sts_BadArgErr; + + OMX_INT n; + OMX_INT n_by_2; + OMX_INT n_by_4; + OMX_INT i; + const OMX_F32 *twiddle; + OMX_F32 *buf; + OMX_F32 *in = (OMX_F32*) pSrc; + + const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec; + + n = pFFTStruct->N; + + // This is to handle the case of order == 1. 
+ if (n == 2) { + pDst[0] = (pSrc[0] + pSrc[2]) / 2; + pDst[1] = (pSrc[0] - pSrc[2]) / 2; + return OMX_Sts_NoErr; + } + + n_by_2 = n >> 1; + n_by_4 = n >> 2; + buf = pFFTStruct->pBuf1; + + twiddle = pFFTStruct->pTwiddle; + + if (n < 8) + RevbinPermuteInv(in, buf, twiddle, n); + else + RevbinPermuteInvSse(in, buf, twiddle, n); + + if (n_by_2 < 16) { + buf = x86SP_F32_radix2_kernel_OutOfPlace( + buf, + pFFTStruct->pBuf2, + buf, + twiddle, + n_by_2, + 0); + } else { + buf = x86SP_F32_radix4_kernel_OutOfPlace_sse( + buf, + pFFTStruct->pBuf2, + buf, + twiddle, + n_by_2, + 0); + } + + // Scale the result by 1/n. + // It contains a scaling factor of 1/2 in + // RevbinPermuteInv/RevbinPermuteInvSse. + OMX_F32 factor = 1.0f / n; + + if (n < 8) { + for (i = 0; i < n_by_2; i++) { + pDst[i << 1] = buf[i] * factor; + pDst[(i << 1) + 1] = buf[i + n_by_2] * factor; + } + } else { + OMX_F32 *base; + OMX_F32 *dst; + VC temp0; + VC temp1; + __m128 mFactor = _mm_load1_ps(&factor); + + // Two things are done in this loop: + // 1 Get the result scaled; 2 Change the format from split to interleaved. + for (i = 0; i < n_by_2; i += 4) { + base = buf + i; + dst = pDst + (i << 1); + VC_LOAD_SPLIT(&temp0, base, n_by_2); + VC_MUL_F(&temp1, &temp0, mFactor); + VC_STORE_INTERLEAVE(dst, &temp1); + } + } + + return OMX_Sts_NoErr; +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c new file mode 100644 index 00000000000..6fa21cfb40d --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n) { + OMX_INT i; + OMX_F32 *out0 = out; + + for (i = 0; i < n; i += 2) { + const OMX_F32 *in0 = in + i; + const OMX_F32 *in1 = in0 + n; + OMX_F32 *out1 = out0 + (n >> 1); + + // CADD out0, in0, in1 + out0[0] = in0[0] + in1[0]; + out0[n] = in0[1] + in1[1]; + + // CSUB out1, in0, in1 + out1[0] = in0[0] - in1[0]; + out1[n] = in0[1] - in1[1]; + + out0 += 1; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c new file mode 100644 index 00000000000..f4d991c85c3 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_F32 *out0 = out; + + for (i = 0; i < n; i += 2) { + OMX_FC32 t; + const OMX_F32 *tw = twiddle + i; + const OMX_F32 *in0 = in + i; + const OMX_F32 *in1 = in0 + 1; + OMX_F32 *out1 = out0 + (n >> 1); + + // CMUL t, tw, in1 + t.Re = tw[0] * in1[0] - tw[n << 1] * in1[n]; + t.Im = tw[0] * in1[n] + tw[n << 1] * in1[0]; + + // CADD out0, in0, t + out0[0] = in0[0] + t.Re; + out0[n] = in0[n] + t.Im; + + // CSUB out1, in0, t + out1[0] = in0[0] - t.Re; + out1[n] = in0[n] - t.Im; + + out0 += 1; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c new file mode 100644 index 00000000000..a712d96e4b3 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_F32 *out0 = out; + OMX_INT i; + + // This function is used when n >= 8 + assert(n >= 8); + if (n < 8) return; + + for (i = 0; i < n; i += 8) { + VC v_tw; + VC v_t0; + VC v_t1; + VC v_temp; + + // Load twiddle + const OMX_F32 *tw = twiddle + i; + v_tw.real = _mm_set_ps(tw[6], tw[4], tw[2], tw[0]); + const OMX_F32 * twi = tw + (n << 1); + v_tw.imag = _mm_set_ps(twi[6], twi[4], twi[2], twi[0]); + + // Load real part + const OMX_F32 *t = in + i; + VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t1.real), t); + + // Load imag part + t = t + n; + VC_LOAD_SHUFFLE(&(v_t0.imag), &(v_t1.imag), t); + + OMX_F32 *out1 = out0 + (n >> 1); + VC_MUL(&v_temp, &v_tw, &v_t1); + + VC_SUB_STORE_SPLIT(out1, &v_t0, &v_temp, n); + + VC_ADD_STORE_SPLIT(out0, &v_t0, &v_temp, n); + + out0 += 4; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c new file mode 100644 index 00000000000..37148775e25 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num) { + OMX_INT grp; + OMX_F32 *out0 = out; + OMX_INT set_count = sub_num >> 1; + + for (grp = 0; grp < sub_size; ++grp) { + OMX_INT set; + const OMX_F32 *tw = twiddle + grp * sub_num; + + for (set = 0; set < set_count; ++set) { + OMX_FC32 t; + const OMX_F32 *in0 = in + set + grp * sub_num; + const OMX_F32 *in1 = in0 + set_count; + OMX_F32 *out1 = out0 + (n >> 1); + + // CMUL t, tw, in1 + t.Re = tw[0] * in1[0] - tw[n << 1] * in1[n]; + t.Im = tw[0] * in1[n] + tw[n << 1] * in1[0]; + + // CADD out0, in0, t + out0[0] = in0[0] + t.Re; + out0[n] = in0[n] + t.Im; + + // CSUB out1, in0, t + out1[0] = in0[0] - t.Re; + out1[n] = in0[n] - t.Im; + + out0 += 1; + } + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c new file mode 100644 index 00000000000..36a40d8a910 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n) { + OMX_INT i; + OMX_INT n_by_4 = n >> 2; + + // Transform from interleaved format to split format. 
+ for (i = 0; i < n; i++) { + out[i] = in[i << 1]; + out[i + n] = in[(i << 1) + 1]; + } + + // As we have already moved data from [in] to [out], + // next calculation will be produced in in-place mode. + for (i = 0; i < n_by_4; i++) { + OMX_F32 *out0 = out + i; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + + // CADD t0, out0, out2 + t0.Re = out0[0] + out2[0]; + t0.Im = out0[n] + out2[n]; + + // CSUB t1, out0, out2 + t1.Re = out0[0] - out2[0]; + t1.Im = out0[n] - out2[n]; + + // CADD t2, out1, out3 + t2.Re = out1[0] + out3[0]; + t2.Im = out1[n] + out3[n]; + + // CSUB t3, out1, out3 + t3.Re = out1[0] - out3[0]; + t3.Im = out1[n] - out3[n]; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CADD_SUB_X out1, t1, t3 + out1[0] = t1.Re + t3.Im; + out1[n] = t1.Im - t3.Re; + + // CSUB_ADD_X out3, t1, t3 + out3[0] = t1.Re - t3.Im; + out3[n] = t1.Im + t3.Re; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c new file mode 100644 index 00000000000..58908d3aa2b --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n) { + OMX_INT i; + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i += 8) { + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + const OMX_F32 *in0 = in + i; + const OMX_F32 *in1 = in0 + n_by_2; + const OMX_F32 *in2 = in1 + n_by_2; + const OMX_F32 *in3 = in2 + n_by_2; + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t0.imag), in0); + VC_LOAD_SHUFFLE(&(v_t1.real), &(v_t1.imag), in1); + VC_LOAD_SHUFFLE(&(v_t2.real), &(v_t2.imag), in2); + VC_LOAD_SHUFFLE(&(v_t3.real), &(v_t3.imag), in3); + + RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c new file mode 100644 index 00000000000..08ab35bf86a --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_INT i; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i += 2) { + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + OMX_FC32 tt1; + OMX_FC32 tt2; + OMX_FC32 tt3; + const OMX_F32 *tw1 = twiddle + i; + const OMX_F32 *tw2 = tw1 + i; + const OMX_F32 *tw3 = tw2 + i; + const OMX_F32 *in0 = in + (i << 1); + const OMX_F32 *in1 = in0 + 1; + const OMX_F32 *in2 = in1 + 1; + const OMX_F32 *in3 = in2 + 1; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + // CMUL tt1, tw1, in1 + tt1.Re = tw1[0] * in1[0] - tw1[n_mul_2] * in1[n]; + tt1.Im = tw1[0] * in1[n] + tw1[n_mul_2] * in1[0]; + + // CMUL tt2, tw2, in2 + tt2.Re = tw2[0] * in2[0] - tw2[n_mul_2] * in2[n]; + tt2.Im = tw2[0] * in2[n] + tw2[n_mul_2] * in2[0]; + + // CMUL tt3, tw3, in3 + tt3.Re = tw3[0] * in3[0] - tw3[n_mul_2] * in3[n]; + tt3.Im = tw3[0] * in3[n] + tw3[n_mul_2] * in3[0]; + + // CADD t0, in0, tt2 + t0.Re = in0[0] + tt2.Re; + t0.Im = in0[n] + tt2.Im; + + // CSUB t1, in0, tt2 + t1.Re = in0[0] - tt2.Re; + t1.Im = in0[n] - tt2.Im; + + // CADD t2, tt1, tt3 + t2.Re = tt1.Re + tt3.Re; + t2.Im = tt1.Im + tt3.Im; + + // CSUB t3, tt1, tt3 + t3.Re = tt1.Re - tt3.Re; + t3.Im = tt1.Im - tt3.Im; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CADD_SUB_X out1, t1, t3 + out1[0] = t1.Re + t3.Im; + out1[n] = t1.Im - t3.Re; + + // CSUB_ADD_X out3, t1, t3 + out3[0] = t1.Re - t3.Im; + out3[n] = t1.Im + t3.Re; + + out0 += 1; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c 
b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c new file mode 100644 index 00000000000..4fc34271809 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_INT i; + + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i += 8) { + const OMX_F32 *tw1 = twiddle + i; + const OMX_F32 *tw2 = tw1 + i; + const OMX_F32 *tw3 = tw2 + i; + const OMX_F32 *in0 = in + (i << 1); + const OMX_F32 *in1 = in0 + 4; + const OMX_F32 *in2 = in1 + 4; + const OMX_F32 *in3 = in2 + 4; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + VC v_tw1; + VC v_tw2; + VC v_tw3; + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + v_tw1.real = _mm_set_ps(tw1[6], tw1[4], tw1[2], tw1[0]); + v_tw1.imag = _mm_set_ps( + tw1[6 + n_mul_2], + tw1[4 + n_mul_2], + tw1[2 + n_mul_2], + tw1[n_mul_2]); + v_tw2.real = _mm_set_ps(tw2[12], tw2[8], tw2[4], tw2[0]); + v_tw2.imag = _mm_set_ps( + tw2[12 + n_mul_2], + tw2[8 + n_mul_2], + tw2[4 + n_mul_2], + tw2[n_mul_2]); + v_tw3.real = _mm_set_ps(tw3[18], tw3[12], tw3[6], tw3[0]); + v_tw3.imag = _mm_set_ps( + tw3[18 + n_mul_2], + tw3[12 + n_mul_2], + tw3[6 + n_mul_2], + tw3[n_mul_2]); + + 
VC_LOAD_MATRIX_TRANSPOSE(&v_t0, &v_t1, &v_t2, &v_t3, in0, in1, in2, in3, n); + + RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7, + &v_tw1, &v_tw2, &v_tw3, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c new file mode 100644 index 00000000000..de2a1be7a9b --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num) { + OMX_INT set; + OMX_INT grp; + OMX_INT step = sub_num >> 1; + OMX_INT set_count = sub_num >> 2; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_F32 *out0 = out; + + // grp == 0 + for (set = 0; set < set_count; ++set) { + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + + const OMX_F32 *in0 = in + set; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + // CADD t0, in0, in2 + t0.Re = in0[0] + in2[0]; + t0.Im = in0[n] + in2[n]; + + // CSUB t1, in0, in2 + t1.Re = in0[0] - in2[0]; + t1.Im = in0[n] - in2[n]; + + // CADD t2, in1, in3 + t2.Re = in1[0] + in3[0]; + t2.Im = in1[n] + in3[n]; + + // CSUB t3, in1, in3 + t3.Re = in1[0] - in3[0]; + t3.Im = in1[n] - in3[n]; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CSUB_ADD_X out3, t1, t3 + out3[0] = t1.Re - t3.Im; + out3[n] = t1.Im + t3.Re; + + // CADD_SUB_X out1, t1, t3 + out1[0] = t1.Re + t3.Im; + out1[n] = t1.Im - t3.Re; + + out0 += 1; + } + + // grp > 0 + for (grp = 1; grp < sub_size; ++grp) { + const OMX_F32 *tw1 = twiddle + grp * step; + const OMX_F32 *tw2 = tw1 + grp * step; + const OMX_F32 *tw3 = tw2 + grp * step; + + for (set = 0; set < set_count; ++set) { + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + OMX_FC32 tt1; + OMX_FC32 tt2; + OMX_FC32 tt3; + + const OMX_F32 *in0 = in + set + grp * sub_num; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 
= out2 + n_by_4; + + // CMUL tt1, Tw1, in1 + tt1.Re = tw1[0] * in1[0] - tw1[n_mul_2] * in1[n]; + tt1.Im = tw1[0] * in1[n] + tw1[n_mul_2] * in1[0]; + + // CMUL tt2, Tw2, in2 + tt2.Re = tw2[0] * in2[0] - tw2[n_mul_2] * in2[n]; + tt2.Im = tw2[0] * in2[n] + tw2[n_mul_2] * in2[0]; + + // CMUL tt3, Tw3, in3 + tt3.Re = tw3[0] * in3[0] - tw3[n_mul_2] * in3[n]; + tt3.Im = tw3[0] * in3[n] + tw3[n_mul_2] * in3[0]; + + // CADD t0, in0, tt2 + t0.Re = in0[0] + tt2.Re; + t0.Im = in0[n] + tt2.Im; + + // CSUB t1, in0, tt2 + t1.Re = in0[0] - tt2.Re; + t1.Im = in0[n] - tt2.Im; + + // CADD t2, tt1, tt3 + t2.Re = tt1.Re + tt3.Re; + t2.Im = tt1.Im + tt3.Im; + + // CSUB t3, tt1, tt3 + t3.Re = tt1.Re - tt3.Re; + t3.Im = tt1.Im - tt3.Im; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CADD_SUB_X out1, t1, t3 + out1[0] = t1.Re + t3.Im; + out1[n] = t1.Im - t3.Re; + + // CSUB_ADD_X out3, t1, t3 + out3[0] = t1.Re - t3.Im; + out3[n] = t1.Im + t3.Re; + + out0 += 1; + } + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c new file mode 100644 index 00000000000..286f842c464 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +// This function handles the case when set_count = 2, in which we cannot +// unroll the set loop by 4 to meet the SSE requirement (4 elements). +static void InternalUnroll2Fwd( + const OMX_F32 *in, + OMX_F32 *out, + OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i += 8) { + const OMX_F32 *tw1 = twiddle + i; + const OMX_F32 *tw2 = tw1 + i; + const OMX_F32 *tw3 = tw2 + i; + const OMX_F32 *tw1e = tw1 + 4; + const OMX_F32 *tw2e = tw2 + 8; + const OMX_F32 *tw3e = tw3 + 12; + + VC v_tw1; + VC v_tw2; + VC v_tw3; + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1), + _mm_load_ss(tw1e), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2), + _mm_load_ss(tw1e + n_mul_2), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2), + _mm_load_ss(tw2e), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2), + _mm_load_ss(tw2e + n_mul_2), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3), + _mm_load_ss(tw3e), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2), + _mm_load_ss(tw3e + n_mul_2), + _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 xmm0; + __m128 xmm1; + __m128 xmm2; + __m128 xmm3; + __m128 xmm4; + __m128 xmm5; + __m128 xmm6; + __m128 xmm7; + + const OMX_F32 *in0 = in + (i << 1); + xmm0 = _mm_load_ps(in0); + xmm1 = _mm_load_ps(in0 + 4); + xmm2 = _mm_load_ps(in0 + 8); + xmm3 = _mm_load_ps(in0 + 12); + v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0)); + v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2)); + v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0)); + v_t3.real = _mm_shuffle_ps(xmm1, xmm3, 
_MM_SHUFFLE(3, 2, 3, 2)); + + xmm4 = _mm_load_ps(in0 + n); + xmm5 = _mm_load_ps(in0 + n + 4); + xmm6 = _mm_load_ps(in0 + n + 8); + xmm7 = _mm_load_ps(in0 + n + 12); + v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0)); + v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2)); + v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0)); + v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2)); + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7, + &v_tw1, &v_tw2, + &v_tw3, &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } +} + +void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse( + const OMX_F32 *in, + OMX_F32 *out, + OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num) { + OMX_INT set; + OMX_INT grp; + OMX_INT step = sub_num >> 1; + OMX_INT set_count = sub_num >> 2; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + + OMX_F32 *out0 = out; + + if (set_count == 2) { + InternalUnroll2Fwd(in, out, twiddle, n); + return; + } + + // grp == 0 + for (set = 0; set < set_count; set += 4) { + const OMX_F32 * in0 = in + set; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + VC_LOAD_SPLIT(&v_t0, in0, n); + VC_LOAD_SPLIT(&v_t1, in1, n); + VC_LOAD_SPLIT(&v_t2, in2, n); + VC_LOAD_SPLIT(&v_t3, in3, n); + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } + + for (grp = 1; grp < sub_size; ++grp) { + const OMX_F32 *tw1 = twiddle + grp * step; + const 
OMX_F32 *tw2 = tw1 + grp * step; + const OMX_F32 *tw3 = tw2 + grp * step; + + VC v_tw1; + VC v_tw2; + VC v_tw3; + + v_tw1.real = _mm_load1_ps(tw1); + v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2); + v_tw2.real = _mm_load1_ps(tw2); + v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2); + v_tw3.real = _mm_load1_ps(tw3); + v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2); + + for (set = 0; set < set_count; set += 4) { + const OMX_F32 *in0 = in + set + grp * sub_num; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + VC_LOAD_SPLIT(&v_t0, in0, n); + VC_LOAD_SPLIT(&v_t1, in1, n); + VC_LOAD_SPLIT(&v_t2, in2, n); + VC_LOAD_SPLIT(&v_t3, in3, n); + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7, + &v_tw1, &v_tw2, &v_tw3, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c new file mode 100644 index 00000000000..9f17d61b757 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix2_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n) { + OMX_INT i; + OMX_INT n_by_2 = n >> 1; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i++) { + const OMX_F32 *in0 = in + i; + const OMX_F32 *in1 = in0 + n_by_2; + OMX_F32 *out1 = out0 + n_by_2; + + // CADD out0, in0, in1 + out0[0] = in0[0] + in1[0]; + out0[n] = in0[n] + in1[n]; + + // CSUB out1, in0, in1 + out1[0] = in0[0] - in1[0]; + out1[n] = in0[n] - in1[n]; + + out0 += 1; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c new file mode 100644 index 00000000000..ec545c5365a --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix2_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_F32 *out0 = out; + + for (i = 0; i < n; i += 2) { + OMX_FC32 t; + const OMX_F32 *tw = twiddle + i; + const OMX_F32 *in0 = in + i; + const OMX_F32 *in1 = in0 + 1; + OMX_F32 *out1 = out0 + (n >> 1); + + // CMUL t, tw, in1 + t.Re = tw[0] * in1[0] + tw[n << 1] * in1[n]; + t.Im = tw[0] * in1[n] - tw[n << 1] * in1[0]; + + // CADD out0, in0, t + out0[0] = in0[0] + t.Re; + out0[n] = in0[n] + t.Im; + + // CSUB out1, in0, t + out1[0] = in0[0] - t.Re; + out1[n] = in0[n] - t.Im; + + out0 += 1; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c new file mode 100644 index 00000000000..abad0cc998d --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_F32 *out0 =out; + OMX_INT i; + + for (i = 0; i < n; i += 8) { + VC v_tw; + VC v_t0; + VC v_t1; + VC v_temp; + + // Load twiddle + const OMX_F32 *tw = twiddle + i; + v_tw.real = _mm_set_ps(tw[6], tw[4], tw[2], tw[0]); + const OMX_F32 * twi = tw + (n << 1); + v_tw.imag = _mm_set_ps(twi[6], twi[4], twi[2], twi[0]); + + // Load real part + const OMX_F32 *t = in + i; + VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t1.real), t); + + // Load imag part + t = t + n; + VC_LOAD_SHUFFLE(&(v_t0.imag), &(v_t1.imag), t); + + OMX_F32 *out1 = out0 + (n >> 1); + VC_CONJ_MUL(&v_temp, &v_tw, &v_t1); + + VC_SUB_STORE_SPLIT(out1, &v_t0, &v_temp, n); + + VC_ADD_STORE_SPLIT(out0, &v_t0, &v_temp, n); + + out0 += 4; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c new file mode 100644 index 00000000000..78bc9ebdb61 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix2_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num) { + OMX_INT grp; + OMX_F32 *out0 = out; + OMX_INT set_count = sub_num >> 1; + + for (grp = 0; grp < sub_size; ++grp) { + OMX_INT set; + const OMX_F32 *tw = twiddle + grp * sub_num; + + for (set = 0; set < set_count; ++set) { + OMX_FC32 t; + const OMX_F32 *in0 = in + set + grp * sub_num; + const OMX_F32 *in1 = in0 + set_count; + OMX_F32 *out1 = out0 + (n >> 1); + + // CMUL t, tw, in1 + t.Re = tw[0] * in1[0] + tw[n << 1] * in1[n]; + t.Im = tw[0] * in1[n] - tw[n << 1] * in1[0]; + + // CADD out0, in0, t + out0[0] = in0[0] + t.Re; + out0[n] = in0[n] + t.Im; + + // CSUB out1, in0, t + out1[0] = in0[0] - t.Re; + out1[n] = in0[n] - t.Im; + + out0 += 1; + } + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c new file mode 100644 index 00000000000..bb80fa30830 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix4_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n) { + OMX_INT i; + OMX_INT n_by_4 = n >> 2; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_4; i++) { + const OMX_F32 *in0 = in + i; + const OMX_F32 *in1 = in0 + n_by_4; + const OMX_F32 *in2 = in1 + n_by_4; + const OMX_F32 *in3 = in2 + n_by_4; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + + // CADD t0, in0, in2 + t0.Re = in0[0] + in2[0]; + t0.Im = in0[n] + in2[n]; + + // CSUB t1, in0, in2 + t1.Re = in0[0] - in2[0]; + t1.Im = in0[n] - in2[n]; + + // CADD t2, in1, in3 + t2.Re = in1[0] + in3[0]; + t2.Im = in1[n] + in3[n]; + + // CSUB t3, in1, in3 + t3.Re = in1[0] - in3[0]; + t3.Im = in1[n] - in3[n]; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CSUB_ADD_X out1, t1, t3 + out1[0] = t1.Re - t3.Im; + out1[n] = t1.Im + t3.Re; + + // CADD_SUB_X out3, t1, t3 + out3[0] = t1.Re + t3.Im; + out3[n] = t1.Im - t3.Re; + + out0 += 1; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c new file mode 100644 index 00000000000..c3921bc46a4 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n) { + OMX_INT i; + OMX_INT n_by_4 = n >> 2; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_4; i += 4) { + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + const OMX_F32 *in0 = in + i; + const OMX_F32 *in1 = in0 + n_by_4; + const OMX_F32 *in2 = in1 + n_by_4; + const OMX_F32 *in3 = in2 + n_by_4; + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + VC_LOAD_SPLIT(&v_t0, in0, n); + VC_LOAD_SPLIT(&v_t1, in1, n); + VC_LOAD_SPLIT(&v_t2, in2, n); + VC_LOAD_SPLIT(&v_t3, in3, n); + + RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c new file mode 100644 index 00000000000..705d9cbc342 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix4_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_INT i; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i += 2) { + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + OMX_FC32 tt1; + OMX_FC32 tt2; + OMX_FC32 tt3; + const OMX_F32 *tw1 = twiddle + i; + const OMX_F32 *tw2 = tw1 + i; + const OMX_F32 *tw3 = tw2 + i; + const OMX_F32 *in0 = in + (i << 1); + const OMX_F32 *in1 = in0 + 1; + const OMX_F32 *in2 = in1 + 1; + const OMX_F32 *in3 = in2 + 1; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + // CMUL tt1, Tw1, in1 + tt1.Re = tw1[0] * in1[0] + tw1[n_mul_2] * in1[n]; + tt1.Im = tw1[0] * in1[n] - tw1[n_mul_2] * in1[0]; + + // CMUL tt2, Tw2, in2 + tt2.Re = tw2[0] * in2[0] + tw2[n_mul_2] * in2[n]; + tt2.Im = tw2[0] * in2[n] - tw2[n_mul_2] * in2[0]; + + // CMUL tt3, Tw3, in3 + tt3.Re = tw3[0] * in3[0] + tw3[n_mul_2] * in3[n]; + tt3.Im = tw3[0] * in3[n] - tw3[n_mul_2] * in3[0]; + + // CADD t0, in0, tt2 + t0.Re = in0[0] + tt2.Re; + t0.Im = in0[n] + tt2.Im; + + // CSUB t1, in0, tt2 + t1.Re = in0[0] - tt2.Re; + t1.Im = in0[n] - tt2.Im; + + // CADD t2, tt1, tt3 + t2.Re = tt1.Re + tt3.Re; + t2.Im = tt1.Im + tt3.Im; + + // CSUB t3, tt1, tt3 + t3.Re = tt1.Re - tt3.Re; + t3.Im = tt1.Im - tt3.Im; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CSUB_ADD_X out1, t1, t3 + out1[0] = t1.Re - t3.Im; + out1[n] = t1.Im + t3.Re; + + // CADD_SUB_X out3, t1, t3 + out3[0] = t1.Re + t3.Im; + out3[n] = t1.Im - t3.Re; + + out0 += 1; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c 
b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c new file mode 100644 index 00000000000..2e245faf1a5 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_INT i; + + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i += 8) { + const OMX_F32 *tw1 = twiddle + i; + const OMX_F32 *tw2 = tw1 + i; + const OMX_F32 *tw3 = tw2 + i; + const OMX_F32 *in0 = in + (i << 1); + const OMX_F32 *in1 = in0 + 4; + const OMX_F32 *in2 = in1 + 4; + const OMX_F32 *in3 = in2 + 4; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + VC v_tw1; + VC v_tw2; + VC v_tw3; + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + v_tw1.real = _mm_set_ps(tw1[6], tw1[4], tw1[2], tw1[0]); + v_tw1.imag = _mm_set_ps( + tw1[6 + n_mul_2], + tw1[4 + n_mul_2], + tw1[2 + n_mul_2], + tw1[n_mul_2]); + v_tw2.real = _mm_set_ps(tw2[12], tw2[8], tw2[4], tw2[0]); + v_tw2.imag = _mm_set_ps( + tw2[12 + n_mul_2], + tw2[8 + n_mul_2], + tw2[4 + n_mul_2], + tw2[n_mul_2]); + v_tw3.real = _mm_set_ps(tw3[18], tw3[12], tw3[6], tw3[0]); + v_tw3.imag = _mm_set_ps( + tw3[18 + n_mul_2], + tw3[12 + n_mul_2], + tw3[6 + n_mul_2], + tw3[n_mul_2]); + + 
VC_LOAD_MATRIX_TRANSPOSE(&v_t0, &v_t1, &v_t2, &v_t3, in0, in1, in2, in3, n); + + RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7, + &v_tw1, &v_tw2, &v_tw3, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c new file mode 100644 index 00000000000..499036b9347 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" + +void x86SP_FFT_CToC_FC32_Inv_Radix4_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num) { + OMX_INT set; + OMX_INT grp; + OMX_INT step = sub_num >> 1; + OMX_INT set_count = sub_num >> 2; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_F32 *out0 = out; + + // grp == 0 + for (set = 0; set < set_count; ++set) { + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + + const OMX_F32 *in0 = in + set; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + // CADD t0, in0, in2 + t0.Re = in0[0] + in2[0]; + t0.Im = in0[n] + in2[n]; + + // CSUB t1, in0, in2 + t1.Re = in0[0] - in2[0]; + t1.Im = in0[n] - in2[n]; + + // CADD t2, in1, in3 + t2.Re = in1[0] + in3[0]; + t2.Im = in1[n] + in3[n]; + + // CSUB t3, in1, in3 + t3.Re = in1[0] - in3[0]; + t3.Im = in1[n] - in3[n]; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CSUB_ADD_X out1, t1, t3 + out1[0] = t1.Re - t3.Im; + out1[n] = t1.Im + t3.Re; + + // CADD_SUB_X out3, t1, t3 + out3[0] = t1.Re + t3.Im; + out3[n] = t1.Im - t3.Re; + + out0 += 1; + } + + // grp > 0 + for (grp = 1; grp < sub_size; ++grp) { + const OMX_F32 *tw1 = twiddle + grp * step; + const OMX_F32 *tw2 = tw1 + grp * step; + const OMX_F32 *tw3 = tw2 + grp * step; + + for (set = 0; set < set_count; ++set) { + OMX_FC32 t0; + OMX_FC32 t1; + OMX_FC32 t2; + OMX_FC32 t3; + OMX_FC32 tt1; + OMX_FC32 tt2; + OMX_FC32 tt3; + + const OMX_F32 *in0 = in + set + grp * sub_num; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 
= out2 + n_by_4; + + // CMUL tt1, Tw1, in1 + tt1.Re = tw1[0] * in1[0] + tw1[n_mul_2] * in1[n]; + tt1.Im = tw1[0] * in1[n] - tw1[n_mul_2] * in1[0]; + + // CMUL tt2, Tw2, in2 + tt2.Re = tw2[0] * in2[0] + tw2[n_mul_2] * in2[n]; + tt2.Im = tw2[0] * in2[n] - tw2[n_mul_2] * in2[0]; + + // CMUL tt3, Tw3, in3 + tt3.Re = tw3[0] * in3[0] + tw3[n_mul_2] * in3[n]; + tt3.Im = tw3[0] * in3[n] - tw3[n_mul_2] * in3[0]; + + // CADD t0, in0, tt2 + t0.Re = in0[0] + tt2.Re; + t0.Im = in0[n] + tt2.Im; + + // CSUB t1, in0, tt2 + t1.Re = in0[0] - tt2.Re; + t1.Im = in0[n] - tt2.Im; + + // CADD t2, tt1, tt3 + t2.Re = tt1.Re + tt3.Re; + t2.Im = tt1.Im + tt3.Im; + + // CSUB t3, tt1, tt3 + t3.Re = tt1.Re - tt3.Re; + t3.Im = tt1.Im - tt3.Im; + + // CADD out0, t0, t2 + out0[0] = t0.Re + t2.Re; + out0[n] = t0.Im + t2.Im; + + // CSUB out2, t0, t2 + out2[0] = t0.Re - t2.Re; + out2[n] = t0.Im - t2.Im; + + // CSUB_ADD_X out1, t1, t3 + out1[0] = t1.Re - t3.Im; + out1[n] = t1.Im + t3.Re; + + // CADD_SUB_X out3, t1, t3 + out3[0] = t1.Re + t3.Im; + out3[n] = t1.Im - t3.Re; + + out0 += 1; + } + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c new file mode 100644 index 00000000000..703f316920f --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include "dl/sp/src/x86/x86SP_SSE_Math.h" + +// This function handles the case when set_count = 2, in which we cannot +// unroll the set loop by 4 to meet the SSE requirement (4 elements). +static void InternalUnroll2Inv( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n) { + OMX_INT i; + OMX_INT n_by_2 = n >> 1; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + OMX_F32 *out0 = out; + + for (i = 0; i < n_by_2; i += 8) { + const OMX_F32 *tw1 = twiddle + i; + const OMX_F32 *tw2 = tw1 + i; + const OMX_F32 *tw3 = tw2 + i; + const OMX_F32 *tw1e = tw1 + 4; + const OMX_F32 *tw2e = tw2 + 8; + const OMX_F32 *tw3e = tw3 + 12; + + VC v_tw1; + VC v_tw2; + VC v_tw3; + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1), + _mm_load_ss(tw1e), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2), + _mm_load_ss(tw1e + n_mul_2), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2), + _mm_load_ss(tw2e), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2), + _mm_load_ss(tw2e + n_mul_2), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3), + _mm_load_ss(tw3e), + _MM_SHUFFLE(0, 0, 0, 0)); + v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2), + _mm_load_ss(tw3e + n_mul_2), + _MM_SHUFFLE(0, 0, 0, 0)); + + __m128 xmm0; + __m128 xmm1; + __m128 xmm2; + __m128 xmm3; + __m128 xmm4; + __m128 xmm5; + __m128 xmm6; + __m128 xmm7; + + const OMX_F32 *in0 = in + (i << 1); + xmm0 = _mm_load_ps(in0); + xmm1 = _mm_load_ps(in0 + 4); + xmm2 = _mm_load_ps(in0 + 8); + xmm3 = _mm_load_ps(in0 + 12); + v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0)); + v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2)); + v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0)); + v_t3.real = _mm_shuffle_ps(xmm1, xmm3, 
_MM_SHUFFLE(3, 2, 3, 2)); + + xmm4 = _mm_load_ps(in0 + n); + xmm5 = _mm_load_ps(in0 + n + 4); + xmm6 = _mm_load_ps(in0 + n + 8); + xmm7 = _mm_load_ps(in0 + n + 12); + v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0)); + v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2)); + v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0)); + v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2)); + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7, + &v_tw1, &v_tw2, &v_tw3, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } +} + +void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num) { + OMX_INT set; + OMX_INT grp; + OMX_INT step = sub_num >> 1; + OMX_INT set_count = sub_num >> 2; + OMX_INT n_by_4 = n >> 2; + OMX_INT n_mul_2 = n << 1; + + OMX_F32 *out0 = out; + + if (set_count == 2) { + InternalUnroll2Inv(in, out, twiddle, n); + return; + } + + // grp == 0 + for (set = 0; set < set_count; set += 4) { + const OMX_F32 * in0 = in + set; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + VC_LOAD_SPLIT(&v_t0, in0, n); + VC_LOAD_SPLIT(&v_t1, in1, n); + VC_LOAD_SPLIT(&v_t2, in2, n); + VC_LOAD_SPLIT(&v_t3, in3, n); + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } + + for (grp = 1; grp < sub_size; ++grp) { + const OMX_F32 *tw1 = twiddle + grp * step; + 
const OMX_F32 *tw2 = tw1 + grp * step; + const OMX_F32 *tw3 = tw2 + grp * step; + + VC v_tw1; + VC v_tw2; + VC v_tw3; + + v_tw1.real = _mm_load1_ps(tw1); + v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2); + v_tw2.real = _mm_load1_ps(tw2); + v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2); + v_tw3.real = _mm_load1_ps(tw3); + v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2); + + for (set = 0; set < set_count; set += 4) { + const OMX_F32 *in0 = in + set + grp * sub_num; + const OMX_F32 *in1 = in0 + set_count; + const OMX_F32 *in2 = in1 + set_count; + const OMX_F32 *in3 = in2 + set_count; + + VC v_t0; + VC v_t1; + VC v_t2; + VC v_t3; + VC v_t4; + VC v_t5; + VC v_t6; + VC v_t7; + + VC_LOAD_SPLIT(&v_t0, in0, n); + VC_LOAD_SPLIT(&v_t1, in1, n); + VC_LOAD_SPLIT(&v_t2, in2, n); + VC_LOAD_SPLIT(&v_t3, in3, n); + + OMX_F32 *out1 = out0 + n_by_4; + OMX_F32 *out2 = out1 + n_by_4; + OMX_F32 *out3 = out2 + n_by_4; + + RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7, + &v_tw1, &v_tw2, &v_tw3, + &v_t0, &v_t1, &v_t2, &v_t3); + + RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3, + &v_t4, &v_t5, &v_t6, &v_t7, n); + + out0 += 4; + } + } +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c new file mode 100644 index 00000000000..0a3d816ffe4 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ * + */ + +#include "dl/api/omxtypes.h" +#include <stdbool.h> + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix2_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num); + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n); + +OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace( + const OMX_F32 *src, + // Two Ping Pong buffers for out of place kernel. + OMX_F32 *buf1, + OMX_F32 *buf2, + const OMX_F32 *twiddle, + OMX_INT n, + bool forward_fft) { + OMX_INT sub_size; + OMX_INT sub_num; + OMX_INT n_by_2 = n >> 1; + OMX_F32 *in = buf1; + OMX_F32 *out = buf2; + + if (forward_fft) + x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(src, in, n); + else + x86SP_FFT_CToC_FC32_Inv_Radix2_fs(src, in, n); + + for (sub_size = 2, sub_num = n_by_2; + sub_size < n_by_2; + sub_size = sub_size << 1, sub_num = sub_num >> 1) { + + if (forward_fft) { + x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(in, out, twiddle, + n, sub_size, sub_num); + } else { + x86SP_FFT_CToC_FC32_Inv_Radix2_ms(in, out, twiddle, + n, sub_size, sub_num); + } + + OMX_F32 *temp = out; + out = in; + in = temp; + } + + // If sub_num <= 1, no need to do the last stage. 
+ if (sub_num <= 1) + return in; + + if (forward_fft) + x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(in, out, twiddle, n); + else + x86SP_FFT_CToC_FC32_Inv_Radix2_ls(in, out, twiddle, n); + + return out; +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c new file mode 100644 index 00000000000..e7c7b892724 --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ +#include "dl/api/omxtypes.h" +#include <stdbool.h> + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse( + const OMX_F32 *in, + OMX_F32 *out, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num); + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse( + const OMX_F32 *in, + OMX_F32 
*out, + const OMX_F32 *twiddle, + OMX_INT n, + OMX_INT sub_size, + OMX_INT sub_num); + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n); + +extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse( + const OMX_F32 *in, + OMX_F32 *out, + const OMX_F32 *twiddle, + OMX_INT n); + +OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace( + const OMX_F32 *src, + OMX_F32 *buf1, + OMX_F32 *buf2, + const OMX_F32 *twiddle, + OMX_INT n, + bool forward_fft) { + OMX_INT sub_size; + OMX_INT sub_num; + OMX_INT n_by_4 = n >> 2; + OMX_F32 *in = buf1; + OMX_F32 *out = buf2; + + if (forward_fft) + x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(src, in, n); + else + x86SP_FFT_CToC_FC32_Inv_Radix4_fs(src, in, n); + + for (sub_size = 4, sub_num = n_by_4; + sub_size < n_by_4; + sub_size = sub_size << 2, sub_num = sub_num >> 2) { + + if (forward_fft) { + x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(in, out, twiddle, + n, sub_size, sub_num); + } else { + x86SP_FFT_CToC_FC32_Inv_Radix4_ms(in, out, twiddle, + n, sub_size, sub_num); + } + + OMX_F32 *temp = out; + out = in; + in = temp; + } + + if (forward_fft) { + if (sub_num == 2) + x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(in, out, twiddle, n); + else + x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(in, out, twiddle, n); + } else { + if (sub_num == 2) + x86SP_FFT_CToC_FC32_Inv_Radix2_ls(in, out, twiddle, n); + else + x86SP_FFT_CToC_FC32_Inv_Radix4_ls(in, out, twiddle, n); + } + + return out; +} + +OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse( + const OMX_F32 *src, + OMX_F32 *buf1, + OMX_F32 *buf2, + const OMX_F32 *twiddle, + OMX_INT n, + // true for forward, false for inverse. 
+ bool forward_fft) { + OMX_INT sub_size, sub_num; + OMX_INT n_by_4 = n >> 2; + OMX_F32 *in, *out; + in = buf1; + out = buf2; + + if (forward_fft) + x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(src, in, n); + else + x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(src, in, n); + + for (sub_size = 4, sub_num = n_by_4; + sub_size < n_by_4; + sub_size = sub_size << 2, sub_num = sub_num >> 2) { + + if (forward_fft) { + x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(in, out, twiddle, + n, sub_size, sub_num); + } else { + x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(in, out, twiddle, + n, sub_size, sub_num); + } + + OMX_F32 *temp = out; + out = in; + in = temp; + } + + // If n is not power of 4, sub_num == 2. + if (forward_fft) { + if (sub_num == 2) + x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(in, out, twiddle, n); + else + x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(in, out, twiddle, n); + } else { + if (sub_num == 2) + x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(in, out, twiddle, n); + else + x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(in, out, twiddle, n); + } + + return out; +} diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h new file mode 100644 index 00000000000..d10a851ae7a --- /dev/null +++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h @@ -0,0 +1,488 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights realserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#include <emmintrin.h> +#include <assert.h> + +/** + * Two data formats are used by the FFT routines, internally. 
/**
 * Data layouts used by the FFT routines.
 *
 * The external FFT interface works on interleaved complex values, where
 * each real part is immediately followed by its imaginary part.
 *
 * Internally the routines use the split format: all real values are placed
 * in the first half of the vector and the corresponding imaginary values in
 * the second half, in the same order. The conversion from interleaved to
 * split format and back is transparent to the external FFT interface.
 *
 * VComplex uses the split format.
 */

/**
 * VComplex holds 4 complex float elements: the real parts are stored in
 * |real| and the corresponding imaginary parts in |imag|.
 */
typedef struct VComplex {
  __m128 real;
  __m128 imag;
} VC;

/* out = a * b */
static inline void VC_MUL(VC *out, VC *a, VC *b) {
  const __m128 re_re = _mm_mul_ps(a->real, b->real);
  const __m128 im_im = _mm_mul_ps(a->imag, b->imag);
  out->real = _mm_sub_ps(re_re, im_im);

  /* The operands are read again only after out->real has been written, so
   * the result is unchanged from the original even if |out| aliases an
   * operand. */
  const __m128 re_im = _mm_mul_ps(a->real, b->imag);
  const __m128 im_re = _mm_mul_ps(a->imag, b->real);
  out->imag = _mm_add_ps(re_im, im_re);
}

/* out = conj(a) * b */
static inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
  const __m128 re_re = _mm_mul_ps(a->real, b->real);
  const __m128 im_im = _mm_mul_ps(a->imag, b->imag);
  out->real = _mm_add_ps(re_re, im_im);

  /* Same read-after-write ordering as VC_MUL. */
  const __m128 re_im = _mm_mul_ps(a->real, b->imag);
  const __m128 im_re = _mm_mul_ps(a->imag, b->real);
  out->imag = _mm_sub_ps(re_im, im_re);
}

/* out = factor * a, where |factor| is a vector of four real scale values. */
static inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
  const __m128 scaled_re = _mm_mul_ps(factor, a->real);
  out->real = scaled_re;
  out->imag = _mm_mul_ps(factor, a->imag);
}

/* out = a + b */
static inline void VC_ADD(VC *out, VC *a, VC *b) {
  const __m128 sum_re = _mm_add_ps(a->real, b->real);
  out->real = sum_re;
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * Cross addition:
 * out.real = a.real + b.imag
 * out.imag = b.real + a.imag
 */
static inline void VC_ADD_X(VC *out, VC *a, VC *b) {
  const __m128 cross_re = _mm_add_ps(a->real, b->imag);
  out->real = cross_re;
  out->imag = _mm_add_ps(b->real, a->imag);
}
*/ +static inline void VC_ADD_STORE_SPLIT( + OMX_F32 *out, + VC *a, + VC *b, + OMX_INT offset) { + _mm_store_ps(out, _mm_add_ps(a->real, b->real)); + _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag)); +} + +/* out = a - b */ +static inline void VC_SUB(VC *out, VC *a, VC *b) { + out->real = _mm_sub_ps(a->real, b->real); + out->imag = _mm_sub_ps(a->imag, b->imag); +} + +/** + * out.real = a.real - b.imag + * out.imag = a.imag - b.real + */ +static inline void VC_SUB_X(VC *out, VC *a, VC *b) { + out->real = _mm_sub_ps(a->real, b->imag); + out->imag = _mm_sub_ps(b->real, a->imag); +} + +/* VC_SUB and store the result with Split format. */ +static inline void VC_SUB_STORE_SPLIT( + OMX_F32 *out, + VC *a, + VC *b, + OMX_INT offset) { + _mm_store_ps(out, _mm_sub_ps(a->real, b->real)); + _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag)); +} + +/** + * out.real = a.real + b.real + * out.imag = a.imag - b.imag + */ +static inline void VC_ADD_SUB(VC *out, VC *a, VC *b) { + out->real = _mm_add_ps(a->real, b->real); + out->imag = _mm_sub_ps(a->imag, b->imag); +} + +/** + * out.real = a.real + b.imag + * out.imag = a.imag - b.real + */ +static inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) { + out->real = _mm_add_ps(a->real, b->imag); + out->imag = _mm_sub_ps(a->imag, b->real); +} + +/* VC_ADD_SUB_X and store the result with Split format. 
*/ +static inline void VC_ADD_SUB_X_STORE_SPLIT( + OMX_F32 *out, + VC *a, + VC *b, + OMX_INT offset) { + _mm_store_ps(out, _mm_add_ps(a->real, b->imag)); + _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real)); +} + +/** + * out.real = a.real - b.real + * out.imag = a.imag + b.imag + */ +static inline void VC_SUB_ADD(VC *out, VC *a, VC *b) { + out->real = _mm_sub_ps(a->real, b->real); + out->imag = _mm_add_ps(a->imag, b->imag); +} + +/** + * out.real = a.real - b.imag + * out.imag = a.imag + b.real + */ +static inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) { + out->real = _mm_sub_ps(a->real, b->imag); + out->imag = _mm_add_ps(a->imag, b->real); +} + +/* VC_SUB_ADD_X and store the result with Split format. */ +static inline void VC_SUB_ADD_X_STORE_SPLIT( + OMX_F32 *out, + VC *a, VC *b, + OMX_INT offset) { + _mm_store_ps(out, _mm_sub_ps(a->real, b->imag)); + _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real)); +} + +/** + * out[0] = in.real + * out[offset] = in.imag + */ +static inline void VC_STORE_SPLIT( + OMX_F32 *out, + VC *in, + OMX_INT offset) { + _mm_store_ps(out, in->real); + _mm_store_ps(out + offset, in->imag); +} + +/** + * out.real = in[0]; + * out.imag = in[offset]; +*/ +static inline void VC_LOAD_SPLIT( + VC *out, + const OMX_F32 *in, + OMX_INT offset) { + out->real = _mm_load_ps(in); + out->imag = _mm_load_ps(in + offset); +} + +/* Vector Complex Unpack from Split format to Interleaved format. */ +static inline void VC_UNPACK(VC *out, VC *in) { + out->real = _mm_unpacklo_ps(in->real, in->imag); + out->imag = _mm_unpackhi_ps(in->real, in->imag); +} + +/** + * Vector Complex load from interleaved complex array. 
+ * out.real = [in[0].real, in[1].real, in[2].real, in[3].real] + * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag] + */ +static inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) { + __m128 temp0 = _mm_load_ps(in); + __m128 temp1 = _mm_load_ps(in + 4); + out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0)); + out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); +} +/** + * Vector Complex Load with Split format. + * The input address is not 16 byte aligned. + */ +static inline void VC_LOADU_SPLIT( + VC *out, + const OMX_F32 *in, + OMX_INT offset) { + out->real = _mm_loadu_ps(in); + out->imag = _mm_loadu_ps(in + offset); +} + +/* Reverse the order of the Complex Vector. */ +static inline void VC_REVERSE(VC *v) { + v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3)); + v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3)); +} +/* + * Vector Complex store to interleaved complex array + * out[0] = in.real[0] + * out[1] = in.imag[0] + * out[2] = in.real[1] + * out[3] = in.imag[1] + * out[4] = in.real[2] + * out[5] = in.imag[2] + * out[6] = in.real[3] + * out[7] = in.imag[3] + */ +static inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) { + _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag)); + _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag)); +} + +/** + * Vector Complex Store with Interleaved format. + * Address is not 16 byte aligned. + */ +static inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) { + _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag)); + _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag)); +} + +/* VC_ADD_X and store the result with Split format. */ +static inline void VC_ADD_X_STORE_SPLIT( + OMX_F32 *out, + VC *a, VC *b, + OMX_INT offset) { + _mm_store_ps(out, _mm_add_ps(a->real, b->imag)); + _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag)); +} + +/** + * VC_SUB_X and store the result with inverse order. 
+ * Address is not 16 byte aligned. + */ +static inline void VC_SUB_X_INVERSE_STOREU_SPLIT( + OMX_F32 *out, + VC *a, + VC *b, + OMX_INT offset) { + __m128 t; + t = _mm_sub_ps(a->real, b->imag); + _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3))); + t = _mm_sub_ps(b->real, a->imag); + _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3))); +} + +/** + * Vector Complex Load from Interleaved format to Split format. + * Store the result into two __m128 registers. + */ +static inline void VC_LOAD_SHUFFLE( + __m128 *out0, + __m128 *out1, + const OMX_F32 *in) { + VC temp; + VC_LOAD_INTERLEAVE(&temp, in); + *out0 = temp.real; + *out1 = temp.imag; +} + +/* Finish the butterfly calculation of forward radix4 and store the outputs. */ +static inline void RADIX4_FWD_BUTTERFLY_STORE( + OMX_F32 *out0, + OMX_F32 *out1, + OMX_F32 *out2, + OMX_F32 *out3, + VC *t0, + VC *t1, + VC *t2, + VC *t3, + OMX_INT n) { + /* CADD out0, t0, t2 */ + VC_ADD_STORE_SPLIT(out0, t0, t2, n); + + /* CSUB out2, t0, t2 */ + VC_SUB_STORE_SPLIT(out2, t0, t2, n); + + /* CADD_SUB_X out1, t1, t3 */ + VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n); + + /* CSUB_ADD_X out3, t1, t3 */ + VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n); +} + +/* Finish the butterfly calculation of inverse radix4 and store the outputs. 
*/ +static inline void RADIX4_INV_BUTTERFLY_STORE( + OMX_F32 *out0, + OMX_F32 *out1, + OMX_F32 *out2, + OMX_F32 *out3, + VC *t0, + VC *t1, + VC *t2, + VC *t3, + OMX_INT n) { + /* CADD out0, t0, t2 */ + VC_ADD_STORE_SPLIT(out0, t0, t2, n); + + /* CSUB out2, t0, t2 */ + VC_SUB_STORE_SPLIT(out2, t0, t2, n); + + /* CSUB_ADD_X out1, t1, t3 */ + VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n); + + /* CADD_SUB_X out3, t1, t3 */ + VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n); +} + +/* Radix4 forward butterfly */ +static inline void RADIX4_FWD_BUTTERFLY( + VC *t0, + VC *t1, + VC *t2, + VC *t3, + VC *Tw1, + VC *Tw2, + VC *Tw3, + VC *T0, + VC *T1, + VC *T2, + VC *T3) { + VC tt1, tt2, tt3; + + /* CMUL tt1, Tw1, T1 */ + VC_MUL(&tt1, Tw1, T1); + + /* CMUL tt2, Tw2, T2 */ + VC_MUL(&tt2, Tw2, T2); + + /* CMUL tt3, Tw3, T3 */ + VC_MUL(&tt3, Tw3, T3); + + /* CADD t0, T0, tt2 */ + VC_ADD(t0, T0, &tt2); + + /* CSUB t1, T0, tt2 */ + VC_SUB(t1, T0, &tt2); + + /* CADD t2, tt1, tt3 */ + VC_ADD(t2, &tt1, &tt3); + + /* CSUB t3, tt1, tt3 */ + VC_SUB(t3, &tt1, &tt3); +} + +/* Radix4 inverse butterfly */ +static inline void RADIX4_INV_BUTTERFLY( + VC *t0, + VC *t1, + VC *t2, + VC *t3, + VC *Tw1, + VC *Tw2, + VC *Tw3, + VC *T0, + VC *T1, + VC *T2, + VC *T3) { + VC tt1, tt2, tt3; + + /* CMUL tt1, Tw1, T1 */ + VC_CONJ_MUL(&tt1, Tw1, T1); + + /* CMUL tt2, Tw2, T2 */ + VC_CONJ_MUL(&tt2, Tw2, T2); + + /* CMUL tt3, Tw3, T3 */ + VC_CONJ_MUL(&tt3, Tw3, T3); + + /* CADD t0, T0, tt2 */ + VC_ADD(t0, T0, &tt2); + + /* CSUB t1, T0, tt2 */ + VC_SUB(t1, T0, &tt2); + + /* CADD t2, tt1, tt3 */ + VC_ADD(t2, &tt1, &tt3); + + /* CSUB t3, tt1, tt3 */ + VC_SUB(t3, &tt1, &tt3); +} + +/* Radix4 butterfly in first stage for both forward and inverse */ +static inline void RADIX4_BUTTERFLY_FS( + VC *t0, + VC *t1, + VC *t2, + VC *t3, + VC *T0, + VC *T1, + VC *T2, + VC *T3) { + /* CADD t0, T0, T2 */ + VC_ADD(t0, T0, T2); + + /* CSUB t1, T0, T2 */ + VC_SUB(t1, T0, T2); + + /* CADD t2, T1, T3 */ + VC_ADD(t2, T1, T3); + + /* CSUB t3, 
T1, T3 */ + VC_SUB(t3, T1, T3); +} + +/** + * Load 16 float elements (4 sse registers) which is a 4 * 4 matrix. + * Then Do transpose on the matrix. + * 3, 2, 1, 0 12, 8, 4, 0 + * 7, 6, 5, 4 =====> 13, 9, 5, 1 + * 11, 10, 9, 8 14, 10, 6, 2 + * 15, 14, 13, 12 15, 11, 7, 3 + */ +static inline void VC_LOAD_MATRIX_TRANSPOSE( + VC *T0, + VC *T1, + VC *T2, + VC *T3, + const OMX_F32 *pT0, + const OMX_F32 *pT1, + const OMX_F32 *pT2, + const OMX_F32 *pT3, + OMX_INT n) { + __m128 xmm0; + __m128 xmm1; + __m128 xmm2; + __m128 xmm3; + __m128 xmm4; + __m128 xmm5; + __m128 xmm6; + __m128 xmm7; + + xmm0 = _mm_load_ps(pT0); + xmm1 = _mm_load_ps(pT1); + xmm2 = _mm_load_ps(pT2); + xmm3 = _mm_load_ps(pT3); + + /* Matrix transpose */ + xmm4 = _mm_unpacklo_ps(xmm0, xmm1); + xmm5 = _mm_unpackhi_ps(xmm0, xmm1); + xmm6 = _mm_unpacklo_ps(xmm2, xmm3); + xmm7 = _mm_unpackhi_ps(xmm2, xmm3); + T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0)); + T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2)); + T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0)); + T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2)); + + xmm0 = _mm_load_ps(pT0 + n); + xmm1 = _mm_load_ps(pT1 + n); + xmm2 = _mm_load_ps(pT2 + n); + xmm3 = _mm_load_ps(pT3 + n); + + /* Matrix transpose */ + xmm4 = _mm_unpacklo_ps(xmm0, xmm1); + xmm5 = _mm_unpackhi_ps(xmm0, xmm1); + xmm6 = _mm_unpacklo_ps(xmm2, xmm3); + xmm7 = _mm_unpackhi_ps(xmm2, xmm3); + T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0)); + T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2)); + T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0)); + T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2)); +} |