summaryrefslogtreecommitdiff
path: root/chromium/third_party/openmax_dl
diff options
context:
space:
mode:
authorAndras Becsi <andras.becsi@digia.com>2014-03-18 13:16:26 +0100
committerFrederik Gladhorn <frederik.gladhorn@digia.com>2014-03-20 15:55:39 +0100
commit3f0f86b0caed75241fa71c95a5d73bc0164348c5 (patch)
tree92b9fb00f2e9e90b0be2262093876d4f43b6cd13 /chromium/third_party/openmax_dl
parente90d7c4b152c56919d963987e2503f9909a666d2 (diff)
downloadqtwebengine-chromium-3f0f86b0caed75241fa71c95a5d73bc0164348c5.tar.gz
Update to new stable branch 1750
This also includes an updated ninja and the chromium dependencies needed on Windows.
Change-Id: Icd597d80ed3fa4425933c9f1334c3c2e31291c42
Reviewed-by: Zoltan Arvai <zarvai@inf.u-szeged.hu>
Reviewed-by: Zeno Albisser <zeno.albisser@digia.com>
Diffstat (limited to 'chromium/third_party/openmax_dl')
-rw-r--r--chromium/third_party/openmax_dl/dl/api/arm/armCOMM_s.h (renamed from chromium/third_party/openmax_dl/dl/api/armCOMM_s.h)49
-rw-r--r--chromium/third_party/openmax_dl/dl/api/arm/armOMX.h (renamed from chromium/third_party/openmax_dl/dl/api/armOMX.h)0
-rw-r--r--chromium/third_party/openmax_dl/dl/api/arm/omxtypes_s.h (renamed from chromium/third_party/openmax_dl/dl/api/omxtypes_s.h)0
-rw-r--r--chromium/third_party/openmax_dl/dl/dl.gyp231
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/api/armSP.h8
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/api/omxSP.h234
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/api/x86SP.h39
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armSP_FFT_S32TwiddleTable.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c)0
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S260
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S145
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S213
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S310
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S386
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S161
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S328
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S227
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S180
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/detect.c85
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S409
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S)12
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S)53
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S)28
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S)16
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S639
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S301
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S)4
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_F32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16.c77
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_FC32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC16.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_F32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16.c232
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16S32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S32.c (renamed from chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c)2
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp194
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c228
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c60
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInit_R_F32.c126
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c252
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c36
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c43
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c56
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c50
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c72
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c56
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c90
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c81
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c149
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c215
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c37
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c43
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c52
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c50
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c70
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c55
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c90
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c81
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c149
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c215
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c99
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c190
-rw-r--r--chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h488
99 files changed, 7826 insertions, 276 deletions
diff --git a/chromium/third_party/openmax_dl/dl/api/armCOMM_s.h b/chromium/third_party/openmax_dl/dl/api/arm/armCOMM_s.h
index 6b0d2be66a2..6ce1e2fc6a3 100644
--- a/chromium/third_party/openmax_dl/dl/api/armCOMM_s.h
+++ b/chromium/third_party/openmax_dl/dl/api/arm/armCOMM_s.h
@@ -371,6 +371,17 @@
.endm
+ @// M_ALLOC8 name, size
+ @// Allocate 8-byte aligned area of name |name| and size |size| bytes.
+ @// The running byte counter _SBytes is first rounded up to a multiple
+ @// of 8, the resulting offset is recorded in the symbol |name|_F, and
+ @// _SBytes is then advanced by |size|.
+ .macro M_ALLOC8 name, size
+ @// Pad only when _SBytes is not already a multiple of 8.
+ .if (_SBytes & 7) != 0
+ .set _SBytes, _SBytes + (8 - (_SBytes & 7))
+ .endif
+ @// |name|_F is the offset of the area; _SBytes moves past it.
+ .set \name\()_F, _SBytes
+ .set _SBytes, _SBytes + \size
+
+ .endm
+
@ Load word from stack
.macro M_LDR r, a0, a1, a2, a3
_M_DATA "ldr", 4, \r, \a0, \a1, \a2, \a3
@@ -381,6 +392,16 @@
_M_DATA "str", 4, \r, \a0, \a1, \a2, \a3
.endm
+ @ Load double word from stack
+ @ r0, r1 = destination registers
+ @ a0..a3 = addressing arguments, forwarded unchanged to _M_DATA2
+ @ together with the "ldrd" mnemonic and an alignment of 8
+ .macro M_LDRD r0, r1, a0, a1, a2, a3
+ _M_DATA2 "ldrd", 8, \r0, \r1, \a0, \a1, \a2, \a3
+ .endm
+
+ @ Store double word to stack
+ @ r0, r1 = source registers
+ @ a0..a3 = addressing arguments, forwarded unchanged to _M_DATA2
+ @ together with the "strd" mnemonic and an alignment of 8
+ .macro M_STRD r0, r1, a0, a1, a2, a3
+ _M_DATA2 "strd", 8, \r0, \r1, \a0, \a1, \a2, \a3
+ .endm
+
@ Macro to perform a data access operation
@ Such as LDR or STR
@ The addressing mode is modified such that
@@ -407,3 +428,31 @@
.set _Offset, _Workspace + \a0\()_F
\i\a1 \r, [sp, #_Offset]
.endm
+
+ @ Macro to perform a two-register data access operation
+ @ Such as LDRD or STRD
+ @ The addressing mode is modified such that
+ @ 1. If no address is given then the name is taken
+ @ as a stack offset
+ @ 2. If the addressing mode is not available for the
+ @ state being assembled for (eg Thumb) then a suitable
+ @ addressing mode is substituted.
+ @
+ @ On Entry:
+ @ i = Instruction to perform (eg "ldrd")
+ @ a = Required byte alignment (not referenced by the macro body)
+ @ r0, r1 = Registers to transfer
+ @ a0,a1,a2,a3. Addressing mode and condition. One of:
+ @ label {,cc}
+ @ [base] {,,,cc}
+ @ [base, offset]{!} {,,cc}
+ @ [base, offset, shift]{!} {,cc}
+ @ [base], offset {,,cc}
+ @ [base], offset, shift {,cc}
+ @
+ @ WARNING: Most of the above are not supported, except the first case.
+ @ The body always emits
+ @ i{a1} r0, r1, [sp, #(_Workspace + a0_F)]
+ @ so a0 must be a name whose a0_F offset symbol is defined, a1 is
+ @ appended directly to the mnemonic (presumably the condition code),
+ @ and a2/a3 are not referenced.
+ .macro _M_DATA2 i, a, r0, r1, a0, a1, a2, a3
+ .set _Offset, _Workspace + \a0\()_F
+ \i\a1 \r0, \r1, [sp, #_Offset]
+ .endm
+ \ No newline at end of file
diff --git a/chromium/third_party/openmax_dl/dl/api/armOMX.h b/chromium/third_party/openmax_dl/dl/api/arm/armOMX.h
index 0ad21c42ce2..0ad21c42ce2 100644
--- a/chromium/third_party/openmax_dl/dl/api/armOMX.h
+++ b/chromium/third_party/openmax_dl/dl/api/arm/armOMX.h
diff --git a/chromium/third_party/openmax_dl/dl/api/omxtypes_s.h b/chromium/third_party/openmax_dl/dl/api/arm/omxtypes_s.h
index d880d351fd5..d880d351fd5 100644
--- a/chromium/third_party/openmax_dl/dl/api/omxtypes_s.h
+++ b/chromium/third_party/openmax_dl/dl/api/arm/omxtypes_s.h
diff --git a/chromium/third_party/openmax_dl/dl/dl.gyp b/chromium/third_party/openmax_dl/dl/dl.gyp
index 0573ce25631..61a05b007d9 100644
--- a/chromium/third_party/openmax_dl/dl/dl.gyp
+++ b/chromium/third_party/openmax_dl/dl/dl.gyp
@@ -18,79 +18,10 @@
'include_dirs': [
'../',
],
- 'cflags!': [
- '-mfpu=vfpv3-d16',
- ],
- 'cflags': [
- # We enable Neon instructions even with arm_neon==0, to support
- # runtime detection.
- '-mfpu=neon',
- ],
'sources': [
- 'api/armCOMM_s.h',
- 'api/armOMX.h',
'api/omxtypes.h',
- 'api/omxtypes_s.h',
- 'sp/api/armSP.h',
'sp/api/omxSP.h',
- # Complex 32-bit fixed-point FFT.
- 'sp/src/armSP_FFT_S32TwiddleTable.c',
- 'sp/src/omxSP_FFTGetBufSize_C_SC32.c',
- 'sp/src/omxSP_FFTInit_C_SC32.c',
- 'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S',
- 'sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S',
- 'sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S',
- # Real 32-bit fixed-point FFT
- 'sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S',
- 'sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S',
- 'sp/src/omxSP_FFTGetBufSize_R_S32.c',
- 'sp/src/omxSP_FFTInit_R_S32.c',
- 'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
- # Complex 16-bit fixed-point FFT
- 'sp/src/omxSP_FFTInit_C_SC16.c',
- 'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
- 'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
- 'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
- 'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
- # Real 16-bit fixed-point FFT
- 'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
- 'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
- 'sp/src/omxSP_FFTInit_R_S16S32.c',
- 'sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
- # Complex floating-point FFT
- 'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S',
- 'sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S',
'sp/src/armSP_FFT_F32TwiddleTable.c',
- 'sp/src/omxSP_FFTGetBufSize_C_FC32.c',
- 'sp/src/omxSP_FFTInit_C_FC32.c',
- 'sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
- 'sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
- # Real floating-point FFT
- 'sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S',
- 'sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
- 'sp/src/omxSP_FFTGetBufSize_R_F32.c',
- 'sp/src/omxSP_FFTInit_R_F32.c',
- 'sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
],
'conditions' : [
['big_float_fft == 1', {
@@ -98,6 +29,166 @@
'BIG_FFT_TABLE',
],
}],
+ # ARM build: NEON implementations plus the files shared with the
+ # non-NEON (openmax_dl_armv7) library.
+ ['target_arch=="arm"', {
+ 'cflags!': [
+ '-mfpu=vfpv3-d16',
+ ],
+ 'cflags': [
+ # We enable Neon instructions even with arm_neon==0, to support
+ # runtime detection.
+ '-mfpu=neon',
+ ],
+ # NOTE(review): cpu_features and -llog below come from the Android
+ # NDK; confirm this condition is only reached for Android ARM builds.
+ 'dependencies': [
+ '<(android_ndk_root)/android_tools_ndk.gyp:cpu_features',
+ 'openmax_dl_armv7',
+ ],
+ 'link_settings' : {
+ 'libraries': [
+ # To get the __android_log_print routine
+ '-llog',
+ ],
+ },
+ 'sources': [
+ # Common files that are used by both the NEON and non-NEON code.
+ # The ARM-only API headers live under api/arm/.
+ 'api/arm/armCOMM_s.h',
+ 'api/arm/armOMX.h',
+ 'api/arm/omxtypes_s.h',
+ 'sp/api/armSP.h',
+ 'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
+ 'sp/src/arm/detect.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_R_S16.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
+ 'sp/src/arm/omxSP_FFTInit_C_FC32.c',
+ 'sp/src/arm/omxSP_FFTInit_C_SC16.c',
+ 'sp/src/arm/omxSP_FFTInit_C_SC32.c',
+ 'sp/src/arm/omxSP_FFTInit_R_F32.c',
+ 'sp/src/arm/omxSP_FFTInit_R_S16.c',
+ 'sp/src/arm/omxSP_FFTInit_R_S16S32.c',
+ 'sp/src/arm/omxSP_FFTInit_R_S32.c',
+
+ # Complex 32-bit fixed-point FFT.
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S',
+ 'sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S',
+ # Real 32-bit fixed-point FFT
+ 'sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S',
+ 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
+ # Complex 16-bit fixed-point FFT
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
+ 'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
+ # Real 16-bit fixed-point FFT
+ 'sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
+ 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
+ # Complex floating-point FFT
+ 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S',
+ 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S',
+ 'sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
+ # Real floating-point FFT
+ 'sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S',
+ 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
+ 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
+ ],
+ }],
+ # x86 build (32- and 64-bit): only the real floating-point FFT and
+ # the complex kernels it is built from are listed here.
+ ['target_arch=="ia32" or target_arch=="x64"', {
+ 'cflags': [
+ # Applied to every source in this condition; several of the files
+ # below are *_sse.c variants and x86SP_SSE_Math.h is included.
+ '-msse2',
+ ],
+ 'sources': [
+ # Real 32-bit floating-point FFT.
+ 'sp/api/x86SP.h',
+ 'sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c',
+ 'sp/src/x86/omxSP_FFTGetBufSize_R_F32.c',
+ 'sp/src/x86/omxSP_FFTInit_R_F32.c',
+ 'sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c',
+ 'sp/src/x86/x86SP_FFT_F32_radix2_kernel.c',
+ 'sp/src/x86/x86SP_FFT_F32_radix4_kernel.c',
+ 'sp/src/x86/x86SP_SSE_Math.h',
+ ],
+ }],
+ ],
+ },
+ ],
+ 'conditions': [
+ ['target_arch=="arm"', {
+ 'targets': [
+ {
+ # Non-NEON implementation of FFT. This library is NOT
+ # standalone. Applications must link with openmax_dl.
+ 'target_name': 'openmax_dl_armv7',
+ 'type': 'static_library',
+ 'include_dirs': [
+ '../',
+ ],
+ # These sources must be built without the NEON FPU flag.
+ 'cflags!': [
+ '-mfpu=neon',
+ ],
+ 'sources': [
+ # Complex floating-point FFT
+ 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
+ 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S',
+ 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S',
+ 'sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S',
+ 'sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
+ 'sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
+ # Real floating-point FFT
+ 'sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S',
+ 'sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
+ 'sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
+ ],
+ },
],
- }]
+ }],
+ ],
}
diff --git a/chromium/third_party/openmax_dl/dl/sp/api/armSP.h b/chromium/third_party/openmax_dl/dl/sp/api/armSP.h
index f615a87c7ab..4972f09c554 100644
--- a/chromium/third_party/openmax_dl/dl/sp/api/armSP.h
+++ b/chromium/third_party/openmax_dl/dl/sp/api/armSP.h
@@ -64,6 +64,14 @@ typedef struct ARMsFFTSpec_R_SC32_Tag
OMX_S32 *pBuf;
}ARMsFFTSpec_R_SC32;
+typedef struct ARMsFFTSpec_R_SC16_Tag /* FFT specification for real 16-bit data; presumably filled in by omxSP_FFTInit_R_S16 -- TODO confirm */
+{
+ OMX_U32 N; /* presumably the transform length -- confirm against the init code */
+ OMX_U16 *pBitRev; /* presumably the bit-reversal index table -- confirm */
+ OMX_SC16 *pTwiddle; /* presumably the 16-bit complex twiddle-factor table -- confirm */
+ OMX_S16 *pBuf; /* presumably a scratch buffer used by the transform -- confirm */
+} ARMsFFTSpec_R_SC16;
+
typedef struct ARMsFFTSpec_R_FC32_Tag
{
OMX_U32 N;
diff --git a/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h b/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h
index 3016c772f73..5a7980ad452 100644
--- a/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h
+++ b/chromium/third_party/openmax_dl/dl/sp/api/omxSP.h
@@ -44,6 +44,7 @@ extern "C" {
typedef void OMXFFTSpec_C_SC16;
typedef void OMXFFTSpec_C_SC32;
typedef void OMXFFTSpec_R_S16S32;
+ typedef void OMXFFTSpec_R_S16;
typedef void OMXFFTSpec_R_S32;
typedef void OMXFFTSpec_R_F32;
typedef void OMXFFTSpec_C_FC32;
@@ -1423,7 +1424,7 @@ OMXResult omxSP_FFTInit_C_SC32 (
* Input Arguments:
*
* order - base-2 logarithm of the desired block length; valid in the range
- * [0,12]
+ * [1,15]
*
* Output Arguments:
*
@@ -1436,7 +1437,7 @@ OMXResult omxSP_FFTInit_C_SC32 (
* following is true:
* - pFFTSpec is either NULL or violates the 8-byte alignment
* restrictions
- * - order < 0 or order > 12
+ * - order < 1 or order > 15
*
*/
OMXResult omxSP_FFTInit_C_FC32(
@@ -1487,6 +1488,45 @@ OMXResult omxSP_FFTInit_R_S16S32(
/**
+ * Function: omxSP_FFTInit_R_S16
+ *
+ * Description:
+ * This function initializes the specification structure required for the
+ * real 16-bit FFT and IFFT functions. The function <FFTInit_R_S16> is used
+ * to initialize the specification structure for functions
+ * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Memory for *pFFTFwdSpec must be allocated before calling this function
+ * and should be 8-byte aligned.
+ *
+ * The number of bytes required for *pFFTFwdSpec can be
+ * determined using <FFTGetBufSize_R_S16>.
+ *
+ * Input Arguments:
+ *
+ * order - base-2 logarithm of the desired block length; valid in the range
+ * [1,12]
+ *
+ * Output Arguments:
+ *
+ * pFFTFwdSpec - pointer to the initialized specification structure
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ * following is true:
+ * - pFFTFwdSpec is either NULL or violates the 8-byte alignment
+ * restrictions
+ * - order < 1 or order > 12
+ *
+ */
+OMXResult omxSP_FFTInit_R_S16 (
+ OMXFFTSpec_R_S16*pFFTFwdSpec,
+ OMX_INT order
+);
+
+/**
* Function: omxSP_FFTInit_R_S32 (2.2.4.1.4)
*
* Description:
@@ -1543,7 +1583,7 @@ OMXResult omxSP_FFTInit_R_S32 (
* Input Arguments:
*
* order - base-2 logarithm of the desired block length; valid in the range
- * [0,12]
+ * [1,15]
*
* Output Arguments:
*
@@ -1556,7 +1596,7 @@ OMXResult omxSP_FFTInit_R_S32 (
* following is true:
* - pFFTFwdSpec is either NULL or violates the 8-byte alignment
* restrictions
- * - order < 0 or order > 12
+ * - order < 1 or order > 15
*
*/
OMXResult omxSP_FFTInit_R_F32(
@@ -1644,7 +1684,7 @@ OMXResult omxSP_FFTGetBufSize_C_SC32 (
* Input Arguments:
*
* order - base-2 logarithm of the desired block length; valid in the range
- * [0,12]
+ * [1,15]
*
* Output Arguments:
*
@@ -1657,7 +1697,7 @@ OMXResult omxSP_FFTGetBufSize_C_SC32 (
* OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
* following is true:
* - pSize is NULL
- * - order < 0 or order > 12
+ * - order < 1 or order > 15
*
*/
OMXResult omxSP_FFTGetBufSize_C_FC32(
@@ -1699,6 +1739,38 @@ OMXResult omxSP_FFTGetBufSize_R_S16S32(
);
+/**
+ * Function: omxSP_FFTGetBufSize_R_S16
+ *
+ * Description:
+ * This function computes the size of the specification structure
+ * required for the length 2^order real FFT and IFFT functions. The function
+ * <FFTGetBufSize_R_S16> is used in conjunction with the 16-bit
+ * functions <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Input Arguments:
+ *
+ * order - base-2 logarithm of the length; valid in the range
+ * [1,12]
+ *
+ * Output Arguments:
+ *
+ * pSize - pointer to the number of bytes required for the specification
+ * structure
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ * following is true:
+ * - pSize is NULL
+ * - order < 1 or order > 12
+ *
+ */
+OMXResult omxSP_FFTGetBufSize_R_S16 (
+ OMX_INT order,
+ OMX_INT *pSize
+);
/**
* Function: omxSP_FFTGetBufSize_R_S32 (2.2.4.1.8)
@@ -1743,7 +1815,7 @@ OMXResult omxSP_FFTGetBufSize_R_S32 (
*
* Input Arguments:
*
- * order - base-2 logarithm of the length; valid in the range [0,12]
+ * order - base-2 logarithm of the length; valid in the range [1,15]
*
* Output Arguments:
*
@@ -1756,7 +1828,7 @@ OMXResult omxSP_FFTGetBufSize_R_S32 (
* OMX_Sts_BadArgErr - bad arguments. The function returns
* OMX_Sts_BadArgErr if one or more of the following is true:
* pSize is NULL
- * order < 0 or order > 12
+ * order < 1 or order > 15
*
*/
OMXResult omxSP_FFTGetBufSize_R_F32(
@@ -1886,8 +1958,7 @@ OMXResult omxSP_FFTFwd_CToC_SC32_Sfs (
* must be aligned on a 32-byte boundary.
* pFFTSpec - pointer to the preallocated and initialized specification
* structure
- * scaleFactor - scale factor of the output. Valid value is 0
- * only.
+ * scaleFactor - scale factor of the output. Valid range is [0,16].
*
* Output Arguments:
* order
@@ -2024,6 +2095,59 @@ OMXResult omxSP_FFTFwd_RToCCS_S16S32_Sfs (
);
+/**
+ * Function: omxSP_FFTFwd_RToCCS_S16_Sfs
+ *
+ * Description:
+ * These functions compute an FFT for a real-valued signal of length of 2^order,
+ * where 0 < order <= 12. Transform length is determined by the
+ * specification structure, which must be initialized prior to calling the FFT
+ * function using the appropriate helper, i.e., <FFTInit_R_S16>.
+ * The relationship between the input and output sequences can
+ * be expressed in terms of the DFT, i.e.:
+ *
+ * X[k] = 2^(-scalefactor) . SUM[n=0,...,N-1] x[n].e^(-jnk.2.pi/N)
+ * k=0,1,2,...N-1
+ * N=2^order.
+ *
+ * The conjugate-symmetric output sequence is represented using a CCS vector,
+ * which is of length N+2, and is organized as follows:
+ *
+ * Index: 0 1 2 3 4 5 . . . N-2 N-1 N N+1
+ * Component: R0 0 R1 I1 R2 I2 . . . R[N/2-1] I[N/2-1] R[N/2] 0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components
+ * for FFT bin 'n'. Bins are numbered from 0 to N/2, where N is the FFT length.
+ * Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to
+ * the foldover frequency.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the real-valued input sequence, of length 2^order;
+ * must be aligned on a 32-byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ * scaleFactor - output scale factor; valid range is [0, 16]
+ *
+ * Output Arguments:
+ * pDst - pointer to output sequence, represented using CCS format, of
+ * length (2^order)+2; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, if one or more of the following is true:
+ * - one of the pointers pSrc, pDst, or pFFTSpec is NULL
+ * - pSrc or pDst is not aligned on a 32-byte boundary
+ * - scaleFactor<0 or scaleFactor >16
+ *
+ */
+OMXResult omxSP_FFTFwd_RToCCS_S16_Sfs (
+ const OMX_S16* pSrc,
+ OMX_S16* pDst,
+ const OMXFFTSpec_R_S16* pFFTSpec,
+ OMX_INT scaleFactor
+);
+
/**
* Function: omxSP_FFTFwd_RToCCS_S32_Sfs (2.2.4.4.2)
@@ -2129,7 +2253,29 @@ OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(
const OMXFFTSpec_R_F32* pFFTSpec
);
+#ifdef __arm__
+/*
+ * Non-NEON version of omxSP_FFTFwd_RToCCS_F32_Sfs
+ */
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs_vfp(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec
+);
+/*
+ * Just like omxSP_FFTFwd_RToCCS_F32_Sfs, but automatically detects
+ * whether NEON is available or not and chooses the appropriate
+ * routine.
+ */
+extern OMXResult (*omxSP_FFTFwd_RToCCS_F32)(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec
+);
+#else
+#define omxSP_FFTFwd_RToCCS_F32 omxSP_FFTFwd_RToCCS_F32_Sfs
+#endif
/**
* Function: omxSP_FFTInv_CCSToR_S32S16_Sfs (2.2.4.4.4)
@@ -2179,6 +2325,53 @@ OMXResult omxSP_FFTInv_CCSToR_S32S16_Sfs (
);
+/**
+ * Function: omxSP_FFTInv_CCSToR_S16_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input
+ * sequence. Transform length is determined by the specification structure,
+ * which must be initialized prior to calling the FFT function using
+ * <FFTInit_R_S16>. For a transform of length M, the input
+ * sequence is represented using a packed CCS vector of length
+ * M+2, and is organized as follows:
+ *
+ * Index: 0 1 2 3 4 5 . . . M-2 M-1 M M+1
+ * Component R[0] 0 R[1] I[1] R[2] I[2] . . . R[M/2-1] I[M/2-1] R[M/2] 0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components
+ * for FFT bin n.
+ * Bins are numbered from 0 to M/2, where M is the FFT length. Bin index 0
+ * corresponds to the DC component, and bin index M/2 corresponds to the
+ * foldover frequency.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the complex-valued input sequence represented using
+ * CCS format, of length (2^order) + 2; must be aligned on a 32-byte
+ * boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ * scaleFactor - output scalefactor; range is [0,16]
+ *
+ * Output Arguments:
+ * pDst - pointer to the real-valued output sequence, of length 2^order; must
+ * be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments if one or more of the following is true:
+ * - pSrc, pDst, or pFFTSpec is NULL
+ * - pSrc or pDst is not aligned on a 32-byte boundary
+ * - scaleFactor<0 or scaleFactor >16
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_S16_Sfs (
+ const OMX_S16* pSrc,
+ OMX_S16* pDst,
+ const OMXFFTSpec_R_S16* pFFTSpec,
+ OMX_INT scaleFactor
+);
/**
* Function: omxSP_FFTInv_CCSToR_S32_Sfs (2.2.4.4.4)
@@ -2274,7 +2467,28 @@ OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(
const OMXFFTSpec_R_F32* pFFTSpec
);
+#ifdef __arm__
+/*
+ * Non-NEON version of omxSP_FFTInv_CCSToR_F32_Sfs
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs_vfp(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec
+);
+/*
+ * Just like omxSP_FFTInv_CCSToR_F32_Sfs, but automatically detects
+ * whether NEON is available or not and chooses the appropriate
+ * routine.
+ */
+extern OMXResult (*omxSP_FFTInv_CCSToR_F32)(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec);
+#else
+#define omxSP_FFTInv_CCSToR_F32 omxSP_FFTInv_CCSToR_F32_Sfs
+#endif
#ifdef __cplusplus
}
diff --git a/chromium/third_party/openmax_dl/dl/sp/api/x86SP.h b/chromium/third_party/openmax_dl/dl/sp/api/x86SP.h
new file mode 100644
index 00000000000..53127343b75
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/api/x86SP.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2007-2008 ARM Limited. All Rights Reserved.
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+#ifndef _x86SP_H_
+#define _x86SP_H_
+
+#include "dl/api/omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern OMX_F32 armSP_FFT_F32TwiddleTable[];
+
+typedef struct X86FFTSpec_R_FC32_Tag
+{
+ OMX_U32 N;
+ OMX_F32* pTwiddle;
+ // Ping Pong buffer for doing the N/2 point complex FFT.
+ OMX_F32* pBuf1;
+ OMX_F32* pBuf2;
+
+} X86FFTSpec_R_FC32;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/armSP_FFT_S32TwiddleTable.c
index a0db0575b50..a0db0575b50 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armSP_FFT_S32TwiddleTable.c
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
new file mode 100644
index 00000000000..75d6711cd64
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,260 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of
+@// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S to support float
+@// instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+@// It implements the "scaled"(by 1/2) version of the above formula.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r2
+#define round r3
+
+#define pOut1 r2
+#define size r7
+#define step r3
+#define step1 r6
+#define twStep r12
+#define pTwiddleTmp r14
+#define t0 r12
+
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define w0r s4
+#define w0i s5
+#define y0r s6
+#define y0i s7
+#define w1r s6
+#define w1i s7
+#define y1r s6 /*@// w1r,w1i*/
+#define y1i s7
+#define st0 s8
+#define st1 s9
+#define st2 s10
+#define st3 s11
+#define st4 s12
+#define st5 s13
+@// half = 0.5
+#define half s15
+
+
+
+
+
+ .MACRO FFTSTAGE scaled, inverse,name
+
+ @// Initialize half now.
+ movw N, #0x0000
+ movt N, #0x3f00
+ vmov.f32 half, N @// half = 0.5
+
+ @// Read the size (N) from the structure
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+
+ MOV size,N,ASR #1 @// preserve the contents of N
+
+ MOV step,size,LSL #3 @// step = N/2 * 8 bytes
+ ADD pTwiddleTmp,pTwiddle,#8 @// W^2
+
+ ADD pOut1,pOut,step @// pOut1 = pOut+ N/2*8 bytes
+ @// twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,size,LSL #1
+ MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes
+ SUB step1,step1,#8 @// (N/4-1)*8 bytes
+ ADD argTwiddle,pTwiddle,twStep @// W^1
+
+ @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @// Note: W^(k) is stored as negated value and also need to
+ @// conjugate the values from the table
+
+ @// Z(0) : no need of twiddle multiply
+ @// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+
+ add pSrc, step @// step = N/2*8 bytes
+ vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step]
+ sub pSrc, step
+ vldm.f32 pSrc!, {x0r, x0i}
+
+ SUBS size,size,#2
+
+ vadd.f32 st0, x0r, x1r @// a+c
+ vsub.f32 st1, x0r, x1r @// a-c
+ vmov.f32 x0r, st0
+ vmov.f32 x1r, st1
+ vsub.f32 st0, x0i, x1i @// b-d
+ vadd.f32 x1i, x0i, x1i @// b+d
+ vmov.f32 x0i, st0
+
+
+ vsub.f32 x0r,x0r,x1i @// Z(0).r
+ vadd.f32 x0i,x0i,x1r @// Z(0).i
+
+ vmul.f32 x0r, half
+ vmul.f32 x0i, half
+ vstm.f32 pOut1!, {x0r, x0i} @// pOut1 = pOut+ N/2*8 bytes
+
+ BLT end\name
+ BEQ lastElement\name
+
+ ASR size,size,#1
+evenOddButterflyLoop\name:
+
+ SUB step,step,#16 @// (N/2-2)*8 bytes
+
+ add pSrc, step @// (N/2-1)*8 bytes
+ vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step]
+ sub pSrc, step
+ vldm.f32 pSrc!, {x0r, x0i}
+ add argTwiddle, step1
+ vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
+ sub argTwiddle, step1
+ vldm.f32 argTwiddle!, {w0r, w0i}
+
+ SUB step1,step1,#8
+ SUBS size,size,#1
+
+
+ vsub.f32 st2,x0r,x1r @// a-c
+ vadd.f32 st3,x0i,x1i @// b+d
+ vadd.f32 st0,x0r,x1r @// a+c
+ vsub.f32 st1,x0i,x1i @// b-d
+
+ vmul.f32 x1r,w1r,st2
+ vmul.f32 x1i,w1r,st3
+ vmls.f32 x1r,w1i,st3
+ vmla.f32 x1i,w1i,st2
+
+ vadd.f32 y1r,st0,x1i @// F(N/2 -1)
+ vsub.f32 y1i,x1r,st1 @// y1r,y1i same as w1r, w1i
+
+
+ vmul.f32 x0r,w0r,st2
+ vmul.f32 x0i,w0r,st3
+ vmla.f32 x0r,w0i,st3
+ vmls.f32 x0i,w0i,st2
+
+
+ vadd.f32 st4,st0,x0i @// F(1)
+ vsub.f32 st5,st1,x0r
+
+
+ vmul.f32 y1r, half
+ vmul.f32 y1i, half
+ vmul.f32 st4, half
+ vmul.f32 st5, half
+ add pOut1, step @// (N/2-1)*8 bytes
+ vstm.f32 pOut1, {y1r, y1i} @// {y1r,y1i} = [pOut1, step]
+ sub pOut1, step
+ vstm.f32 pOut1!, {st4, st5}
+
+ MOV t0,argTwiddle @// swap ptr for even and odd twiddles
+ MOV argTwiddle,pTwiddleTmp
+ MOV pTwiddleTmp,t0
+
+ BGT evenOddButterflyLoop\name
+
+
+ @// Last element can be expanded as follows
+ @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)]
+ @// (since W^k is stored as -ve)
+ @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+ @// 1/2[2a+j0] + j (c-jd) [0+j2b]
+ @// (a+bc, -bd)
+ @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name:
+ vldm.f32 pSrc, {x0r, x0i}
+
+ vneg.f32 x0i, x0i
+ vstm.f32 pOut1, {x0r, x0i}
+end\name:
+
+
+ .endm
+
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+
+ M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",Inv
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
new file mode 100644
index 00000000000..c2feb0bc758
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
@@ -0,0 +1,145 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define pPingPongBuf r5
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define pDstBuf r3 /*@// Temporarily hold pingpong buffer ptr*/
+#define grpSize r14
+#define outPointStep r12
+#define setCount r14
+#define pointStep r12
+
+@// Real and Imaginary parts
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define y1r s4
+#define y1i s5
+#define y0r s6
+#define y0i s7
+
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Update grpCount and grpSize right away in order to reuse pGrpCount
+ @// and pGrpSize regs
+
+ mov subFFTSize, #2
+ lsr grpSize, subFFTNum, #1
+ mov subFFTNum, grpSize
+
+ @// pT0+1 increments pT0 by 8 bytes
+ @// pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+ @// Note: outPointStep = pointStep for firststage
+ @// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+ MOV pointStep,grpSize,LSL #3
+
+
+
+ @// Loop on the sets for grp zero
+
+grpZeroSetLoop\name:
+
+ add pSrc, pSrc, pointStep
+ @// {x1r,x1i} = [pSrc, pointStep]
+ vldm.f32 pSrc, {x1r, x1i}
+ sub pSrc, pSrc, pointStep
+ vldm.f32 pSrc!, {x0r, x0i}
+
+ SUBS setCount,setCount,#1 @// decrement the loop counter
+
+
+
+ vsub.f32 y1r,x0r,x1r
+ vsub.f32 y1i,x0i,x1i
+
+ vadd.f32 y0r,x0r,x1r
+ vadd.f32 y0i,x0i,x1i
+
+ add pDst, pDst, outPointStep
+ @// {y1r,y1i} -> [pDst, outPointStep]
+ vstm pDst, {y1r, y1i}
+ sub pDst, pDst, outPointStep
+ vstm pDst!, {y0r, y0i}
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pDst -= 2*grpSize
+ mov pDst, pPingPongBuf
+
+ .endm
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+ M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+@// ENDIF @//ARM1136JS
+
+
+@// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
new file mode 100644
index 00000000000..3bd47252f1e
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
@@ -0,0 +1,213 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the first-stage Radix 4 FFT for an N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define pPingPongBuf r5
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize r14
+#define outPointStep r12
+#define setStep r3
+#define setCount r14 /*@// Reuse grpSize as setCount*/
+#define pointStep r12
+
+@// Real and Imaginary parts
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+#define t3r s0 /*@// Temporarily hold x3r and x3i*/
+#define t3i s1
+#define sr s8
+#define si s9
+
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Define stack arguments
+
+
+ @// Update grpCount and grpSize right away in order to reuse
+ @// pSubFFTSize and pSubFFTNum regs
+ mov subFFTSize, #4
+ lsr grpSize, subFFTNum, #2
+ mov subFFTNum, grpSize
+
+
+ @// pT0+1 increments pT0 by 8 bytes
+ @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+ @// Note: outPointStep = pointStep for firststage
+ @// Note: setCount = grpSize/4 (reuse the updated grpSize for setCount)
+ MOV pointStep,grpSize,LSL #3
+
+
+ @// Calculate the step of input data for the next set
+ @//MOV setStep,pointStep,LSL #1
+ MOV setStep,grpSize,LSL #4
+ @// setStep = 3*pointStep
+ ADD setStep,setStep,pointStep
+ @// setStep = - 3*pointStep+8
+ RSB setStep,setStep,#8
+
+ @// grp = 0 a special case since all the twiddle factors are 1
+ @// Loop on the sets
+
+grpZeroSetLoop\name:
+
+ vldm.f32 pSrc, {x0r, x0i}
+ add pSrc, pSrc, pointStep
+ vldm.f32 pSrc, {x1r, x1i}
+ add pSrc, pSrc, pointStep
+ vldm.f32 pSrc, {x2r, x2i}
+ add pSrc, pSrc, pointStep
+ vldm.f32 pSrc, {x3r, x3i}
+ add pSrc, pSrc, setStep
+
+
+ @// Decrement setcount
+ SUBS setCount,setCount,#1
+
+
+
+ @// finish first stage of 4 point FFT
+
+ vadd.f32 x0r,x0r,x2r @// x0 = x0 + x2
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// x2 = x0 - x2
+ vsub.f32 x2i,x0i,si
+
+ vadd.f32 x1r,x1r,x3r @// x1 = x1 + x3
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// x3 = x1 - x3
+ vsub.f32 x3i,x1i,si
+
+
+ @// finish second stage of 4 point FFT
+
+
+ vadd.f32 x0r,x0r,x1r @// x0 = x0 + x1
+ vadd.f32 x0i,x0i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vsub.f32 x1r,x0r,sr @// x1 = x0 - x1
+ vsub.f32 x1i,x0i,si
+
+ vstm.f32 pDst, {x0r, x0i}
+ add pDst, pDst, outPointStep
+
+ vadd.f32 x2r,x2r,x3i
+ vsub.f32 x2i,x2i,x3r
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 t3r, x2r, si
+ vadd.f32 t3i, x2i, sr
+
+ .ifeqs "\inverse", "TRUE"
+ vstm.f32 pDst, {t3r, t3i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {x1r, x1i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {x2r, x2i}
+ add pDst, pDst, setStep
+ .else
+ vstm.f32 pDst, {x2r, x2i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {x1r, x1i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {t3r, t3i}
+ add pDst, pDst, setStep
+ .endif
+
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pDst -= 2*grpSize
+ mov pDst, pPingPongBuf
+
+ .endm
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+@// ENDIF @//ARM1136JS
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
new file mode 100644
index 00000000000..00e48d1e6ea
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@@ -0,0 +1,310 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpCount r12
+#define step r12 /*@// Reuse grpCount*/
+#define outPointStep r3
+#define setCount r8
+#define diff r9
+#define pointStep r14
+
+#define t1 r3 /*@// Reuse outPointStep*/
+
+@// Real and Imaginary parts used in the inner grp loop
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+
+@// Temporary reg to hold the twiddle multiplies
+
+#define t0r s8
+#define t0i s9
+#define t2r s10
+#define t2i s11
+#define sr s12
+#define si s13
+
+
+
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ @// Define stack arguments
+
+
+ @// Update grpCount and grpSize right away in order to reuse
+ @// pGrpCount and pGrpSize regs
+
+ LSL grpCount,subFFTSize,#2
+ lsr subFFTNum, subFFTNum, #2
+ mov subFFTSize, grpCount
+
+
+ @// pT0+1 increments pT0 by 8 bytes
+ @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+ mov pointStep, subFFTNum, lsl #1
+
+
+ @// pOut0+1 increments pOut0 by 8 bytes
+ @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size
+ @// bytes
+
+ @// Use setCount as dummy. It's set correctly below.
+ smull outPointStep, setCount, grpCount, pointStep
+
+ LSL pointStep,pointStep,#2 @// 2*grpSize
+
+
+ MOV setCount,pointStep,LSR #3
+
+ @// Interchange grpLoop and setLoop
+
+setLoop\name:
+
+ MOV step,#0
+ @// Set pSrc and pDst for the grpLoop
+
+ SUB diff,outPointStep,pointStep
+
+ @// Save setCount on stack to reuse the reg
+
+ ADD pSrc,pSrc,diff,LSL #2 @// pSrc += (grpCount-1)*grpStep
+ ADD pDst,pDst,diff @// pDst += (grpCount-1)*setCount
+ ADD step,step,diff @// step += (grpCount-1)*setCount
+
+
+
+ @// Loop on the grps
+
+grpLoop\name:
+
+
+
+ @// butterfly loop
+ add pSrc, pointStep
+ vldm.f32 pSrc, {x3r, x3i} @// data[1]
+ add pTwiddle, step
+ vldm.f32 pTwiddle, {x1r, x1i} @// coef[1]
+ add pTwiddle, step
+ vldm.f32 pTwiddle, {x2r, x2i} @// coef[2]
+ add pSrc, pointStep
+ vldm.f32 pSrc, {x0r, x0i} @// data[2]
+
+ @// do first complex multiply
+ vmul.f32 t0r, x3r, x1r
+ vmul.f32 t0i, x3i, x1r
+
+ .ifeqs "\inverse", "TRUE"
+ vmla.f32 t0r, x3i, x1i
+ vmls.f32 t0i, x3r, x1i
+ vmov.f32 x1r, t0r
+ vmov.f32 x1i, t0i
+ .else
+ vmls.f32 t0r, x3i, x1i
+ vmla.f32 t0i, x3r, x1i
+ vmov.f32 x1r, t0r
+ vmov.f32 x1i, t0i
+ .endif
+
+ add pTwiddle, pTwiddle, step
+ vldm pTwiddle, {x3r, x3i} @// coef[3]
+ sub pTwiddle, pTwiddle, step
+
+ @// do second complex multiply
+ vmul.f32 t0r, x0r, x2r
+ vmul.f32 t0i, x0i, x2r
+
+ .ifeqs "\inverse", "TRUE"
+ vmla.f32 t0r, x0i, x2i
+ vmls.f32 t0i, x0r, x2i
+ vmov.f32 x2r, t0r
+ vmov.f32 x2i, t0i
+ .else
+ vmls.f32 t0r, x0i, x2i
+ vmla.f32 t0i, x0r, x2i
+ vmov.f32 x2r, t0r
+ vmov.f32 x2i, t0i
+ .endif
+
+ add pSrc, pointStep
+ vldm pSrc, {x0r, x0i} @// data[3]
+ sub pSrc, pointStep
+
+ SUB pTwiddle,pTwiddle,step,LSL #1 @// reset pTwiddle
+ SUBS step,step,pointStep @// decrement loop counter
+
+ @// do third complex multiply
+ SUB pSrc,pSrc,pointStep,LSL #1 @// reset pSrc to data[0]
+ vmul.f32 t0r, x0r, x3r
+ vmul.f32 t0i, x0i, x3r
+
+ .ifeqs "\inverse", "TRUE"
+ vmla.f32 t0r, x0i, x3i
+ vmls.f32 t0i, x0r, x3i
+ vmov.f32 x3r, t0r
+ vmov.f32 x3i, t0i
+ .else
+ vmls.f32 t0r, x0i, x3i
+ vmla.f32 t0i, x0r, x3i
+ vmov.f32 x3r, t0r
+ vmov.f32 x3i, t0i
+ .endif
+
+ vldm pSrc, {x0r, x0i} @// data[0]
+
+ @// finish first stage of 4 point FFT
+ vadd.f32 x0r,x0r,x2r @// x0 = x0 + x2 (u0)
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// x2 = x0 - x2 (u1)
+ vsub.f32 x2i,x0i,si
+
+ vadd.f32 x1r,x1r,x3r @// x1 = x1 + x3 (u2)
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// x3 = x1 - x3 (u3)
+ vsub.f32 x3i,x1i,si
+
+
+ @// finish second stage of 4 point FFT
+
+ @// y0 = u1-u2 since twiddles are stored as -ve values
+ vsub.f32 x2r,x2r,x1r
+ vsub.f32 x2i,x2i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vadd.f32 x1r,x2r,sr @// y2 = u1+u2
+ vadd.f32 x1i,x2i,si
+ vstm pDst, {x2r, x2i} @// store y0
+
+ vsub.f32 x0r,x0r,x3i @// y3 = u0+ju3
+ vadd.f32 x0i,x0i,x3r
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vadd.f32 t2r,x0r,si @// y1 = u0-ju3
+ vsub.f32 t2i,x0i,sr @// t2 uses its own regs (s10,s11), not x2
+
+ .ifeqs "\inverse", "TRUE"
+ add pDst, outPointStep
+ vstm pDst, {t2r, t2i} @// store y1
+ add pDst, outPointStep
+ vstm pDst, {x1r, x1i} @// store y2
+ add pDst, outPointStep
+ vstm pDst, {x0r, x0i} @// store y3
+ sub pDst, outPointStep
+ .else
+ add pDst, outPointStep
+ vstm pDst, {x0r, x0i} @// store y1
+ add pDst, outPointStep
+ vstm pDst, {x1r, x1i} @// store y2
+ add pDst, outPointStep
+ vstm pDst, {t2r, t2i} @// store y3
+ sub pDst, outPointStep
+ .endif
+
+ SUB pDst,pDst,outPointStep, LSL #1 @// reset pDst
+ @// update the pDst for the next grp
+ SUBGE pDst,pDst,pointStep
+ @// update the pSrc for the next grp
+ SUBGE pSrc,pSrc,pointStep,LSL #2
+
+
+ BGE grpLoop\name
+
+ ADD pSrc,pSrc,#8 @// pSrc += 1; for the next set
+ ADD pDst,pDst,#8 @// pDst += 1; for the next set
+
+ SUBS setCount,setCount,#1 @// decrement loop counter
+
+
+ BGT setLoop\name
+
+ @// Reset and Swap pSrc and pDst for the next stage
+ MOV t1,pDst
+ SUB pDst,pSrc,subFFTNum,LSL #3
+ SUB pSrc,t1,subFFTNum,LSL #3
+
+ .endm
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+@// ENDIF @//ARM1136JS
+
+
+
+@// Guarding implementation by the processor name
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
new file mode 100644
index 00000000000..4ac2da47ac3
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
@@ -0,0 +1,386 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+#define pPingPongBuf r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize r14
+#define step1 r3
+#define step2 r8
+#define setCount r14 /*@// Reuse grpSize as setCount*/
+#define pointStep r12
+
+#define t0 r4
+@// Real and Imaginary parts
+
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+#define t3r s8 /*@// Temporarily hold x3r and x3i*/
+#define t3i s9
+#define t1r s4
+#define t1i s5
+#define sr s10
+#define si s11
+#define roothalf s12
+
+@// Define macros to load/store two float regs from/to the stack.
+ @// M_VSTM r0, r1, p
+ @// Store the two float registers r0 and r1 to the stack slot named p.
+ @// The slot address is sp + _Workspace + <p>_F and is built in t0 (r4),
+ @// so t0 is overwritten by every use of this macro.
+ @// NOTE(review): _Workspace and the <p>_F symbols are presumably set up
+ @// by M_ALLOC8/M_START in armCOMM_s.h - confirm.
+ .macro M_VSTM r0, r1, p
+ .set _Offset, _Workspace + \p\()_F
+ add t0, sp, #_Offset
+ vstm.f32 t0, {\r0, \r1}
+ .endm
+
+ @// M_VLDM r0, r1, p
+ @// Load the two float registers r0 and r1 from the stack slot named p.
+ @// Mirrors M_VSTM: same address computation, t0 (r4) is overwritten.
+ .macro M_VLDM r0, r1, p
+ .set _Offset, _Workspace + \p\()_F
+ add t0, sp, #_Offset
+ vldm.f32 t0, {\r0, \r1}
+ .endm
+
+@// Define constants
+
+ @// FFTSTAGE scaled, inverse, name
+ @// Body of a first-stage radix-8 butterfly pass over complex float data.
+ @//
+ @// Macro arguments:
+ @// scaled - not referenced anywhere in this macro body
+ @// inverse - "TRUE" selects the inverse output ordering in the two
+ @// .ifeqs blocks below; anything else selects the forward one
+ @// name - suffix appended to the loop label so that the macro can be
+ @// expanded more than once in the same file
+ @//
+ @// Registers on entry:
+ @// pSrc (r0) - input data
+ @// pDst (r2) - output buffer for this stage
+ @// subFFTNum (r6) - number of points; divided by 8 here
+ @// pPingPongBuf (r5) - becomes pDst on exit
+ @//
+ @// Registers on exit:
+ @// subFFTSize (r7) = 8, subFFTNum (r6) = entry value >> 3,
+ @// pSrc = start of the data written by this stage, pDst = pPingPongBuf.
+ @// r3, r4, r8, r12, r14 and s0-s12 are used as scratch. The stack slots
+ @// pU0, pU1, pU3, pU4, pU5 and pU7 must have been allocated by the caller
+ @// of the macro (see the M_ALLOC8 lines before each M_START).
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ @// Define stack arguments
+
+
+ @// Update subFFTSize, subFFTNum and grpSize right away so that the
+ @// subFFTSize and subFFTNum registers hold their exit values
+
+ mov subFFTSize, #8
+ lsr grpSize, subFFTNum, #3
+ mov subFFTNum, grpSize
+
+
+ @// pT0+1 increments pT0 by 8 bytes
+ @// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+ @// Note: setCount = grpSize/8 (reuse the updated grpSize for
+ @// setCount)
+ @// pointStep = grpSize * 8 bytes (one complex float is 8 bytes)
+ MOV pointStep,grpSize,LSL #3
+
+
+ @// Calculate the step of input data for the next set
+ MOV step1,grpSize,LSL #4 @// step1 = 2*pointStep
+ MOV step2,pointStep,LSL #3
+ SUB step2,step2,pointStep @// step2 = 7*pointStep
+
+
+ @// grp = 0 a special case since all the twiddle factors are 1
+ @// Loop on the sets
+
+ @// t0 = 0x3f3504f3, the single-precision bit pattern of sqrt(1/2)
+ movw t0,#0x04f3
+ movt t0,#0x3f35
+ vmov.f32 roothalf, t0 @// roothalf = sqrt(1/2)
+
+grpZeroSetLoop\name:
+
+ @// Load the even-indexed inputs; consecutive loads are step1 apart
+ vldm.f32 pSrc, {x0r, x0i} @// x0
+ add pSrc, step1
+ vldm.f32 pSrc, {x1r, x1i} @// x2
+ add pSrc, step1
+ vldm.f32 pSrc, {x2r, x2i} @// x4
+ add pSrc, step1
+ vldm.f32 pSrc, {x3r, x3i} @// x6
+ add pSrc, step1
+
+ @// Net advance so far is 8*pointStep - 7*pointStep = pointStep,
+ @// which leaves pSrc on x1
+ SUB pSrc, pSrc, step2
+
+ @// finish first stage of 8 point FFT and save on stack
+
+ @// Each butterfly forms the sum in place and then derives the
+ @// difference as sum - 2*b, with 2*b held in sr/si
+ vadd.f32 x0r,x0r,x2r @// u0
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// u1
+ vsub.f32 x2i,x0i,si
+
+ M_VSTM x0r,x0i, pU0
+ M_VSTM x2r,x2i, pU1
+
+ vadd.f32 x1r,x1r,x3r @// u4
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// u5
+ vsub.f32 x3i,x1i,si
+
+ M_VSTM x1r,x1i, pU4
+ M_VSTM x3r,x3i, pU5
+
+
+ @// Load the odd-indexed inputs
+ vldm pSrc, {x0r, x0i} @// x1
+ add pSrc, step1
+ vldm pSrc, {x1r, x1i} @// x3
+ add pSrc, step1
+ vldm pSrc, {x2r, x2i} @// x5
+ add pSrc, step1
+ vldm pSrc, {x3r, x3i} @// x7
+ add pSrc, #8
+
+ @// Together with the #8 above this leaves pSrc on x0 of the next set
+ SUB pSrc, pSrc, step2
+
+ vadd.f32 x0r,x0r,x2r @// u2
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// u3
+ vsub.f32 x2i,x0i,si
+
+ M_VSTM x2r,x2i, pU3
+
+ vadd.f32 x1r,x1r,x3r @// u6
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// u7
+ vsub.f32 x3i,x1i,si
+
+ @// finish second and third stage of 8 point FFT
+
+ M_VSTM x3r,x3i, pU7
+ M_VLDM x2r,x2i, pU0
+
+ @// Decrement setcount
+ @// The flags set here are consumed by the BGT at the bottom of the
+ @// loop; no instruction in between updates the flags
+ SUBS setCount,setCount,#1
+ M_VLDM x3r,x3i, pU4
+
+ vadd.f32 x0r,x0r,x1r @// v4
+ vadd.f32 x0i,x0i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vsub.f32 x1r,x0r,sr @// v6
+ vsub.f32 x1i,x0i,si
+
+ vadd.f32 x2r,x2r,x3r @// v0
+ vadd.f32 x2i,x2i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x2r,sr @// v2
+ vsub.f32 x3i,x2i,si
+
+
+
+ vadd.f32 x2r,x2r,x0r @// y0
+ vadd.f32 x2i,x2i,x0i
+
+ vadd.f32 sr, x0r, x0r
+ vadd.f32 si, x0i, x0i
+ vsub.f32 x0r,x2r,sr @// y4
+ vsub.f32 x0i,x2i,si
+
+ vstm pDst, {x2r, x2i} @// store y0
+ add pDst, step1
+
+ vadd.f32 x3r,x3r,x1i @// y6
+ vsub.f32 x3i,x3i,x1r
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vsub.f32 t1r,x3r,si @// t1r=x2r reg;t1i=x2i reg
+ vadd.f32 t1i,x3i,sr @// y2
+
+ @// The forward and inverse variants differ only in which of the two
+ @// register pairs is written to the y2 slot and which to the y6 slot
+ .ifeqs "\inverse", "TRUE"
+ vstm pDst, {t1r, t1i} @// store y2
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y4
+ add pDst, step1
+ vstm pDst, {x3r, x3i} @// store y6
+ add pDst, step1
+ .else
+ vstm pDst, {x3r, x3i} @// store y2
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y4
+ add pDst, step1
+ vstm pDst, {t1r, t1i} @// store y6
+ add pDst, step1
+ .endif
+
+ SUB pDst, pDst, step2 @// set pDst to y1
+
+
+ M_VLDM x0r,x0i,pU1 @// Load u1,u3,u5,u7
+ M_VLDM x1r,x1i,pU5
+ M_VLDM x3r,x3i,pU7
+
+ vsub.f32 x0r,x0r,x1i @// v1
+ vadd.f32 x0i,x0i,x1r
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vadd.f32 t1r,x0r,si @// t1r=x2r reg;t1i=x2i reg
+ vsub.f32 t1i,x0i,sr @// v3
+
+ M_VLDM x1r,x1i,pU3
+
+ vsub.f32 x1r,x1r,x3i @// v5
+ vadd.f32 x1i,x1i,x3r
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vadd.f32 t3r,x1r,si @// t3r,t3i (s8,s9) receive v7
+ vsub.f32 t3i,x1i,sr @// v7
+
+ @// store v5 as (v5.r - v5.i,v5.r + v5.i)
+ @// store v7 as (v7.i + v7.r,v7.i - v7.r)
+
+ vadd.f32 x3r,t3i,t3r @// v7
+ vsub.f32 x3i,t3i,t3r
+
+ @// x1r becomes v5.r - v5.i; doubling x1i and adding the new x1r then
+ @// yields (v5.r - v5.i) + 2*v5.i = v5.r + v5.i
+ vsub.f32 x1r,x1r,x1i @// v5
+ vadd.f32 x1i, x1i
+ vadd.f32 x1i,x1r,x1i
+
+ vmul.f32 x3r, x3r, roothalf @// (v7.i + v7.r)*(1/sqrt(2))
+ vmul.f32 x3i, x3i, roothalf @// (v7.i - v7.r)*(1/sqrt(2))
+ vmul.f32 x1r, x1r, roothalf @// (v5.r - v5.i)*(1/sqrt(2))
+ vmul.f32 x1i, x1i, roothalf @// (v5.r + v5.i)*(1/sqrt(2))
+
+ vadd.f32 x2r,x2r,x3r @// y7
+ vadd.f32 x2i,x2i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x2r,sr @// y3
+ vsub.f32 x3i,x2i,si
+
+
+ vsub.f32 x0r,x0r,x1r @// y5
+ vsub.f32 x0i,x0i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vadd.f32 x1r,x0r,sr @// y1
+ vadd.f32 x1i,x0i,si
+
+ @// As above, the inverse variant writes the same four register pairs
+ @// to the odd output slots in the opposite order
+ .ifeqs "\inverse", "TRUE"
+ vstm pDst, {x1r, x1i} @// store y1
+ add pDst, step1
+ vstm pDst, {x3r, x3i} @// store y3
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y5
+ add pDst, step1
+ vstm pDst, {x2r, x2i} @// store y7
+ add pDst, #8
+ .else
+ vstm pDst, {x2r, x2i} @// store y1
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y3
+ add pDst, step1
+ vstm pDst, {x3r, x3i} @// store y5
+ add pDst, step1
+ vstm pDst, {x1r, x1i} @// store y7
+ add pDst, #8
+ .endif
+
+ @// Net effect per iteration: pDst advances by 8 bytes
+ SUB pDst, pDst, step2 @// update pDst for the next set
+
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ @// pDst advanced 8 bytes per set, pointStep bytes in total, so
+ @// pDst - pointStep is the start of the data just written
+ SUB pSrc,pDst,pointStep @// pSrc = start of this stage output
+ mov pDst, pPingPongBuf
+
+
+ .ENDM
+
+
+
+
+
+ @// Allocate stack memory required by the function
+
+ @// Ensure 8 byte alignment to use M_VLDM
+ @// Six 8-byte stack slots, one per intermediate complex value that the
+ @// FFTSTAGE macro spills with M_VSTM and reloads with M_VLDM.
+ @// u2 and u6 are never spilled, so no pU2/pU6 slot is needed.
+ M_ALLOC8 pU0, 8
+ M_ALLOC8 pU1, 8
+ M_ALLOC8 pU3, 8
+ M_ALLOC8 pU4, 8
+ M_ALLOC8 pU5, 8
+ M_ALLOC8 pU7, 8
+
+ @// Forward first-stage radix-8 entry point (macro label suffix FWD).
+ M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+ @// Allocate stack memory required by the function
+
+ @// Ensure 8 byte alignment to use M_VLDM
+ @// Same six 8-byte spill slots as the forward variant; they are declared
+ @// again before this M_START.
+ @// NOTE(review): presumably M_ALLOC8 declarations apply only to the next
+ @// M_START - confirm in armCOMM_s.h.
+ M_ALLOC8 pU0, 8
+ M_ALLOC8 pU1, 8
+ M_ALLOC8 pU3, 8
+ M_ALLOC8 pU4, 8
+ M_ALLOC8 pU5, 8
+ M_ALLOC8 pU7, 8
+
+ @// Inverse first-stage radix-8 entry point: inverse = "TRUE" selects the
+ @// inverse output ordering inside FFTSTAGE (macro label suffix INV).
+ M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+
+@// Guarding implementation by the processor name
+
+
+ .END
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
new file mode 100644
index 00000000000..25b4976ca80
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
@@ -0,0 +1,161 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTFwd_CToC_SC32_Sfs_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r2
+#define round r3
+
+#define x0r s0
+#define x0i s1
+
+
+
+
+ @// Allocate stack memory required by the function
+
+ @// Write function header
+ @// omxSP_FFTFwd_CToC_FC32_Sfs_vfp
+ @// Driver for the forward complex float FFT built from the VFP stages.
+ @// Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec.
+ @// Exit: r0 = OMX_Sts_NoErr on every path.
+ @// Reads N, the twiddle table pointer and the work buffer pointer from
+ @// pFFTSpec, runs one first-stage routine (radix 2, 4 or 8) and then
+ @// repeats the generic radix-4 stage until subFFTNum reaches 1.
+ M_START omxSP_FFTFwd_CToC_FC32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @// order = 31 - clz(N), i.e. log2(N) when N is a power of two
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+ @//MOV subFFTNum,N
+
+
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ @// order = 0, 1
+ @// LT here means order = 0: copy the single complex value through
+ vldmlt.f32 pSrc, {x0r, x0i}
+ vstmlt.f32 pDst, {x0r, x0i}
+
+ MOVLT pSrc,pDst
+ BLT End
+
+ @// Handle order = 1
+ MOV argDst,pDst @// Set input args to fft stages
+ MOV argTwiddle,pTwiddle
+ BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B End
+
+
+
+orderGreaterthan1:
+
+ @// Bit 1 of order decides whether the first stage writes to pDst or
+ @// to the work buffer; the other buffer is handed over in pOut (r5).
+ @// NOTE(review): presumably chosen so that the last stage lands in
+ @// pDst - confirm against the stage routines.
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @// Pass the first stage destination in r5 (pOut)
+ MOV argTwiddle,pTwiddle
+
+ @//check for even or odd order
+
+ @// NOTE: The following combination of BL's would work fine
+ @// eventhough the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ @// sets the Z flag to EQ
+
+ @// Even order starts with radix 4, odd order with radix 8
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+ @// Run generic radix-4 stages until subFFTNum (r6) is 1
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ End
+ BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
new file mode 100644
index 00000000000..dd1690ad10b
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
@@ -0,0 +1,328 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute FFT for a real signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+@// N=1 case
+#define scaleMinusOne r2
+#define rnd r2
+#define zero r8
+#define Zero r9
+
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r10
+#define round r3
+
+#define step r3
+#define step1 r6
+#define twStep r12
+#define pTwiddleTmp r14
+#define t0 r12
+#define t1 r14 /*@// pTwiddleTmp*/
+#define t2 r0
+#define t3 r1 /*@// pSrc,argTwiddle*/
+#define t4 r6
+#define t5 r7 /*@// step1,subFFTSize*/
+
+#define x0r s0
+#define x0i s1
+#define y0r s2
+#define y0i s3
+#define x1r s4
+#define x1i s5
+#define w1r s2
+#define w1i s3
+#define w0r s6
+#define w0i s7
+#define y1r s2 /*@// w1r,w1i*/
+#define y1i s3
+#define st0 s8
+#define st1 s9
+#define st2 s10
+#define st3 s11
+#define st4 s12
+#define st5 s13
+#define half s15
+
+
+
+
+ @// Allocate stack memory required by the function
+
+
+
+ @// Write function header
+ @// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp
+ @// Forward FFT of a real float signal using the VFP stages.
+ @// Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec.
+ @// Exit: r0 = OMX_Sts_NoErr on every path.
+ @// The N real inputs are processed as an N/2-point complex FFT, followed
+ @// by the fix-up pass at finalComplexToRealFixup that combines the
+ @// complex results pairwise with the twiddle factors.
+ M_START omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Setup half value
+ @// 0x3f000000 is the single-precision bit pattern of 0.5
+ movw N, #0 @// Use N as a temp.
+ movt N, #0x3f00
+ vmov.f32 half, N
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @// N=1 Treat separately
+ CMP N,#1
+ BGT sizeGreaterThanOne
+ @// N<=1 is not supported
+ @// NOTE(review): this path still returns OMX_Sts_NoErr and writes
+ @// nothing to pDst - confirm that callers never pass N <= 1
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+ B FunctionEnd
+
+sizeGreaterThanOne:
+ @// Do a N/2 point complex FFT including the scaling
+
+ MOV N,N,ASR #1 @// N/2 point complex FFT
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+ @//MOV subFFTNum,N
+
+
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ @// LT here means order = 0: copy the single complex value into the
+ @// work buffer and go straight to the fix-up pass
+ vldmlt.f32 pSrc, {x0r, x0i}
+ vstmlt.f32 pOut, {x0r, x0i}
+ MOVLT pSrc,pOut
+ MOVLT argDst,pDst
+ BLT FFTEnd
+
+ @// order = 1: a single radix-2 first stage
+ MOV argDst,pOut @// Set input args to fft stages
+ MOV pOut,pDst @// Set input args to fft stages
+ MOV argTwiddle,pTwiddle
+
+ BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B finalComplexToRealFixup
+
+orderGreaterthan1:
+
+ @// Bit 1 of order decides which buffer the first stage writes to; the
+ @// other buffer is handed over in pOut (r5)
+ TST order, #2 @// Set input args to fft stages
+ MOVEQ argDst,pDst
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @// Pass the first stage dest in r5 (pOut)
+ MOV argTwiddle,pTwiddle
+
+ @//check for even or odd order
+
+ @// NOTE: The following combination of BL's would work fine
+ @// eventhough the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+ @// the Z flag to EQ
+
+ @// Even order starts with radix 4, odd order with radix 8
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+ @// Run generic radix-4 stages until subFFTNum (r6) is 1
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ FFTEnd
+ BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+FFTEnd:
+finalComplexToRealFixup:
+
+ @// step = N/2 * 8 bytes
+ MOV step,subFFTSize,LSL #3
+ @// twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,subFFTSize,LSL #1
+ @// step1 = N/4 * 8 = N/2*4 bytes
+ MOV step1,subFFTSize,LSL #2
+ @// (N/4-1)*8 bytes
+ SUB step1,step1,#8
+
+ @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+ @// 1/2 [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
+ @// 1/2 [2a+j0] - j [0+j2b]
+ @// (a+b, 0)
+
+ @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+ @// 1/2 [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
+ @// 1/2 [2a+j0] + j [0+j2b]
+ @// (a-b, 0)
+
+ @// F(0) and F(N/2)
+ vldm.f32 pSrc!, {x0r, x0i}
+ vadd.f32 y0r,x0r,x0i @// F(0) = (Z0.r+Z0.i , 0)
+ vsub.f32 x0r,x0r,x0i @// F(N/2) = (Z0.r-Z0.i , 0)
+ vsub.f32 y0i, y0i @ y0i and x0i set to 0.0
+ vsub.f32 x0i, x0i
+
+ @// F(N/2) goes step bytes after the start of the output, F(0) at the
+ @// start; the second store also advances argDst past F(0)
+ add argDst, step
+ vstm.f32 argDst, {x0r, x0i} @// {x0r,x0i}->[argDst, step]
+ sub argDst, step
+ vstm.f32 argDst!, {y0r, y0i}
+
+ @// The flags from this SUBS select the exit below: LT when only F(0)
+ @// and F(N/2) exist, EQ when just the middle element remains
+ SUBS subFFTSize,subFFTSize,#2
+
+ ADD pTwiddleTmp,argTwiddle,#8 @// W^2
+ ADD argTwiddle,argTwiddle,twStep @// W^1
+ BLT End
+ BEQ lastElement
+
+
+ @// F(k) = 1/2 [Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+ @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
+ @// both of them require Z(1) and Z(N/2-1)
+
+ ASR subFFTSize,subFFTSize,#1
+evenOddButterflyLoop:
+
+ SUB step,step,#16 @// (N/2-2)*8 bytes
+
+ add pSrc, step
+ vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step]
+ sub pSrc, step
+ vldm.f32 pSrc!, {x0r, x0i}
+ add argTwiddle, step1
+ vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
+ sub argTwiddle, step1
+ vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8
+
+ SUB step1,step1,#8
+ @// Loop counter; the flags are consumed by the BGT at the loop end
+ SUBS subFFTSize,subFFTSize,#1
+
+ vsub.f32 st2,x0r,x1r @// a-c
+ vadd.f32 st3,x0i,x1i @// b+d
+ vadd.f32 st0,x0r,x1r @// a+c
+ vsub.f32 st1,x0i,x1i @// b-d
+
+ vmul.f32 x1r,w1r,st2
+ vmul.f32 x1i,w1r,st3
+ vmla.f32 x1r,w1i,st3 @// x1r = w1r*st2 + w1i*st3
+ @//RSB x1r,x1r,#0
+ vmls.f32 x1i,w1i,st2 @// x1i = w1r*st3 - w1i*st2
+
+ @// y1r/y1i share s2/s3 with w1r/w1i, which are no longer needed
+ vsub.f32 y1r, st0, x1i
+ vadd.f32 y1i, x1r, st1
+ vneg.f32 y1i, y1i
+
+ vmul.f32 x0r,w0r,st2
+ vmul.f32 x0i,w0r,st3
+ vmls.f32 x0r,w0i,st3 @// x0r = w0r*st2 - w0i*st3
+ vmla.f32 x0i,w0i,st2 @// x0i = w0r*st3 + w0i*st2
+
+ vsub.f32 st4,st0,x0i @// F(1)
+ vadd.f32 st5,x0r,st1
+
+
+ @// Apply the 1/2 factor from the F(k) formula to both outputs
+ vmul.f32 y1r, half
+ vmul.f32 y1i, half
+ vmul.f32 st4, half
+ vmul.f32 st5, half
+
+ add argDst, step
+ vstm.f32 argDst, {y1r, y1i} @// {y1r,y1i} -> [argDst,step]
+ sub argDst, step
+ vstm.f32 argDst!, {st4, st5}
+
+
+ MOV t0,argTwiddle @// swap ptr for even and odd twiddles
+ MOV argTwiddle,pTwiddleTmp
+ MOV pTwiddleTmp,t0
+
+ BGT evenOddButterflyLoop
+
+ @// Last element can be expanded as follows
+ @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+ @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+ @// 1/2[2a+j0] + j (c+jd) [0+j2b]
+ @// (a-bc, -bd)
+ @// NOTE(review): the code below stores (a, -b), i.e. it only negates
+ @// the imaginary part - presumably the twiddle for this element
+ @// reduces the expression above to that; confirm.
+
+lastElement:
+ vldm.f32 pSrc, {x0r, x0i}
+ vneg.f32 x0i, x0i
+ vstm.f32 argDst, {x0r, x0i}
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+FunctionEnd:
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
new file mode 100644
index 00000000000..d6a47652738
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
@@ -0,0 +1,227 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTInv_CCSToR_S32_Sfs_s.s
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT of a complex signal in CCS format, producing a real signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@// Total num of radix stages required to complete the FFT
+#define count r8
+
+#define round r3
+
+#define x0r s0
+#define x0i s1
+#define y0r s2
+#define y0i s3
+#define x1r s4
+#define x1i s5
+#define w1r s2
+#define w1i s3
+#define w0r s6
+#define w0i s7
+#define y1r s2 /*@// w1r,w1i*/
+#define y1i s3
+#define st0 s8
+#define st1 s9
+#define st2 s10
+#define st3 s11
+#define st4 s12
+#define st5 s13
+#define fscale s2
+#define fone s3
+
+
+
+ @// Allocate stack memory required by the function
+ @// omxSP_FFTInv_CCSToR_F32_Sfs_vfp
+ @// Inverse FFT from a CCS-format spectrum to a real float signal.
+ @// Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec.
+ @// Exit: r0 = OMX_Sts_NoErr on every path.
+ @// Runs the preTwiddleRadix2 stage, then an N/2-point complex inverse
+ @// FFT, then scales the result in place by 1/(N/2).
+ @// pDst and pFFTSpec are saved in the two stack slots below and reloaded
+ @// after the preTwiddle call.
+ M_ALLOC4 pDstOnStack, 4
+ M_ALLOC4 pFFTSpecOnStack, 4
+
+ @// Write function header
+ M_START omxSP_FFTInv_CCSToR_F32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+
+
+ @// N=1 Treat separately
+ @// For N <= 1 a single float is copied from pSrc to pDst
+ CMP N,#1
+ BGT sizeGreaterThanOne
+ vldr.f32 x0r, [pSrc]
+ vstr.f32 x0r, [pDst]
+
+ B End
+
+sizeGreaterThanOne:
+ M_STR pDst,pDstOnStack @// store all the pointers
+ M_STR pFFTSpec,pFFTSpecOnStack
+
+
+ @// Call the preTwiddle Radix2 stage before doing the complex IFFT
+
+ BL armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp
+
+
+complexIFFT:
+
+ M_LDR pFFTSpec,pFFTSpecOnStack
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ ASR N,N,#1 @// N/2 point complex IFFT
+ @// pSrc = work buffer + (N/2)*8 bytes.
+ @// NOTE(review): presumably where the preTwiddle stage left its
+ @// output - confirm in that routine.
+ ADD pSrc,pOut,N,LSL #3 @// set pSrc as pOut1
+ M_LDR pDst,pDstOnStack
+
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ @// LT here means order = 0: copy the single complex value through
+ vldmlt.f32 pSrc, {x0r, x0i}
+ vstmlt.f32 pDst, {x0r, x0i}
+
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+ @// order = 1: a single radix-2 first stage
+ MOV argDst,pDst @// Set input args to fft stages
+ MOV argTwiddle,pTwiddle
+
+ BL armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B FFTEnd
+
+
+orderGreaterthan1:
+
+ @// Bit 1 of order decides which buffer the first stage writes to; the
+ @// other buffer is handed over in pOut (r5)
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @// Pass the first stage destination in r5 (pOut)
+ MOV argTwiddle,pTwiddle
+
+
+ @//check for even or odd order
+
+ @// NOTE: The following combination of BL's would work fine
+ @// eventhough the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+ @// the Z flag to EQ
+
+ @// Even order starts with radix 4, odd order with radix 8
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+ @// Run generic radix-4 stages until subFFTNum (r6) is 1
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ FFTEnd
+ BL armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+FFTEnd:
+
+ vldm.f32 pSrc, {x0r, x0i}
+
+ @// Build fscale = 1.0 / subFFTSize; both operands are converted from
+ @// integers held in VFP registers
+ vmov.f32 fscale, subFFTSize
+ vcvt.f32.s32 fscale, fscale @// fscale = N as a float
+ mov round, #1
+ vmov.f32 fone, round
+ vcvt.f32.s32 fone, fone
+ vdiv.f32 fscale, fone, fscale @// fscale = 1/N
+
+ @// Scale subFFTSize complex values in place at pSrc. The next value is
+ @// loaded only while the counter is still positive.
+scaleFFTData: @// N = subFFTSize
+ SUBS subFFTSize,subFFTSize,#1
+ vmul.f32 x0r, x0r, fscale
+ vmul.f32 x0i, x0i, fscale
+ vstm.f32 pSrc!, {x0r, x0i}
+ vldmgt.f32 pSrc, {x0r, x0i}
+
+ BGT scaleFFTData
+
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S
new file mode 100644
index 00000000000..64aa5da8c5a
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S
@@ -0,0 +1,180 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTInv_CToC_SC32_Sfs_s.s
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTInv_CToC_FC32_Sfs_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r2
+#define round r3
+
+#define x0r s0
+#define x0i s1
+#define fone s2
+#define fscale s3
+
+
+ @// Allocate stack memory required by the function
+
+ @// Write function header
+ @// omxSP_FFTInv_CToC_FC32_Sfs_vfp
+ @// Driver for the inverse complex float FFT built from the VFP stages.
+ @// Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec.
+ @// Exit: r0 = OMX_Sts_NoErr.
+ @// Runs one first-stage routine (radix 2, 4 or 8), repeats the generic
+ @// radix-4 stage until subFFTNum reaches 1, then scales the result in
+ @// place by 1/subFFTSize.
+ M_START omxSP_FFTInv_CToC_FC32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @// order = 31 - clz(N), i.e. log2(N) when N is a power of two
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+ @//MOV subFFTNum,N
+
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ @// Order = 0 or 1
+ @// LT here means order = 0: copy the single complex value through;
+ @// subFFTSize is still 1, so the scaling below multiplies by 1.0
+ vldmlt.f32 pSrc, {x0r, x0i}
+ vstmlt.f32 pDst, {x0r, x0i}
+
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+ @// Handle order = 1
+ MOV argDst,pDst
+ MOV argTwiddle,pTwiddle
+
+ BL armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B FFTEnd
+
+orderGreaterthan1:
+
+ @// Bit 1 of order decides which buffer the first stage writes to; the
+ @// other buffer is handed over in pOut (r5)
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @// Pass the first stage dest in r5 (pOut)
+ MOV argTwiddle,pTwiddle
+
+
+ @//check for even or odd order
+ @// NOTE: The following combination of BL's would work fine
+ @// eventhough the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+ @// the Z flag to EQ
+
+ @// Even order starts with radix 4, odd order with radix 8
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+ @// Run generic radix-4 stages until subFFTNum (r6) is 1
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ FFTEnd
+ BL armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+
+FFTEnd:
+
+ vldm.f32 pSrc, {x0r, x0i}
+
+ @// Build fscale = 1.0 / subFFTSize. 0x3f800000 is the single-precision
+ @// bit pattern of 1.0, so fone needs no integer conversion.
+ vmov.f32 fscale, subFFTSize
+ vcvt.f32.s32 fscale, fscale @// fscale = N as a float
+ movw round, #0
+ movt round, #0x3f80 @// round = 1.0
+ vmov.f32 fone, round
+ vdiv.f32 fscale, fone, fscale @// fscale = 1/N
+ @// Scale subFFTSize complex values in place at pSrc. The next value is
+ @// loaded only while the counter is still positive.
+scaleFFTData: @// N = subFFTSize
+ SUBS subFFTSize,subFFTSize,#1
+ vmul.f32 x0r, x0r, fscale
+ vmul.f32 x0i, x0i, fscale
+ vstm.f32 pSrc, {x0r, x0i}
+ add pSrc, #8
+ vldmgt.f32 pSrc, {x0r, x0i}
+
+ bgt scaleFFTData
+
+
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/detect.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/detect.c
new file mode 100644
index 00000000000..b74220a92fc
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/detect.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include <cpu-features.h>
+
+#include "android/log.h"
+#include "dl/sp/api/omxSP.h"
+
+/*
+ * Report whether the CPU feature mask returned by android_getCpuFeatures()
+ * has the ANDROID_CPU_ARM_FEATURE_NEON bit set.
+ *
+ * Returns 1 when the bit is set and 0 otherwise.
+ *
+ * The parameter list is spelled (void): in C an empty list declares a
+ * function without a prototype, so calls with stray arguments would not
+ * be diagnosed.
+ */
+int HasArmNeon(void) {
+ return (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+}
+
+/*
+ * Point both public real-FFT function pointers (forward and inverse) at
+ * the NEON or the non-NEON (VFP) implementation, depending on
+ * HasArmNeon(), and log which set was chosen.
+ *
+ * Takes no arguments; declared with (void) so the definition is a real
+ * prototype.
+ *
+ * NOTE(review): the two pointers are written with plain assignments and
+ * no synchronization; concurrent first calls would each run this routine.
+ * Presumably benign because every caller stores the same values - confirm.
+ */
+static void SetFFTRoutines(void) {
+ /*
+ * Choose the correct (NEON or non-NEON) routines for both the
+ * forward and inverse FFTs
+ */
+ if (HasArmNeon()) {
+ __android_log_print(ANDROID_LOG_INFO, "OpenMAX DL FFT",
+ "Using NEON FFT");
+ omxSP_FFTFwd_RToCCS_F32 = omxSP_FFTFwd_RToCCS_F32_Sfs;
+ omxSP_FFTInv_CCSToR_F32 = omxSP_FFTInv_CCSToR_F32_Sfs;
+ } else {
+ __android_log_print(ANDROID_LOG_INFO, "OpenMAX DL FFT",
+ "Using non-NEON FFT");
+ omxSP_FFTFwd_RToCCS_F32 = omxSP_FFTFwd_RToCCS_F32_Sfs_vfp;
+ omxSP_FFTInv_CCSToR_F32 = omxSP_FFTInv_CCSToR_F32_Sfs_vfp;
+ }
+}
+
+/*
+ * FIXME: It would be beneficial to use the GCC ifunc attribute to
+ * select the appropriate function at load time. This is apparently
+ * not supported on Android at this time. (Compiler warning that the
+ * ifunc attribute is ignored.)
+ */
+
+/*
+ * Forward FFT. Detect if NEON is supported and update function
+ * pointers to the correct routines for both the forward and inverse
+ * FFTs. Then run the forward FFT routine.
+ */
+static OMXResult DetectForwardRealFFT(const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) {
+ OMXResult status;
+
+ /* Rebind both FFT pointers to the implementation for this CPU. */
+ SetFFTRoutines();
+
+ /* The pointer no longer refers to this thunk; forward the call. */
+ status = (*omxSP_FFTFwd_RToCCS_F32)(pSrc, pDst, pFFTSpec);
+ return status;
+}
+
+/*
+ * Inverse FFT. Detect if NEON is supported and update function
+ * pointers to the correct routines for both the forward and inverse
+ * FFTs. Then run the inverse FFT routine.
+ */
+static OMXResult DetectInverseRealFFT(const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) {
+ OMXResult status;
+
+ /* Rebind both FFT pointers to the implementation for this CPU. */
+ SetFFTRoutines();
+
+ /* The pointer no longer refers to this thunk; forward the call. */
+ status = (*omxSP_FFTInv_CCSToR_F32)(pSrc, pDst, pFFTSpec);
+ return status;
+}
+
+/*
+ * Implementation of the forward and inverse real float FFT.
+ * Initialize to detection routine which will update the pointer to
+ * the correct routine and then call the correct one.
+ */
+/*
+ * Public forward real-FFT entry point. Initially bound to the detection
+ * thunk; SetFFTRoutines() rebinds it to the NEON or non-NEON routine.
+ */
+OMXResult (*omxSP_FFTFwd_RToCCS_F32)(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) = DetectForwardRealFFT;
+
+/*
+ * Public inverse real-FFT entry point. Initially bound to the detection
+ * thunk; SetFFTRoutines() rebinds it to the NEON or non-NEON routine.
+ */
+OMXResult (*omxSP_FFTInv_CCSToR_F32)(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) = DetectInverseRealFFT;
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
index f375991f7dd..f9dd26e491e 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
@@ -22,8 +22,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
new file mode 100644
index 00000000000..950defde8ca
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,409 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
+@ follows. It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision: 7485
+@ Last Modified Date: Fri, 21 Sep 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
+@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
+@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
+@ formula.
+@
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//Input Registers
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+@ Output registers
+#define result r0
+
+@//Local Scratch Registers
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define tmpOrder r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@ Total number of radix stages to complete the FFT.
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+#define pOut1 r2
+#define size r7
+#define step r8
+#define step1 r9
+#define step2 r10
+#define twStep r10
+#define pTwiddleTmp r11
+#define argTwiddle1 r12
+#define zero r14
+
+@ Neon registers
+#define dX0 D0.S16
+#define dX0S32 D0.S32
+#define dShift D1.S16
+#define dX1 D1.S16
+#define dX1S32 D1.S32
+#define dY0 D2.S16
+#define dY1 D3.S16
+#define dX0r D0.S16
+#define dX0rS32 D0.S32
+#define dX0i D1.S16
+#define dX1r D2.S16
+#define dX1i D3.S16
+#define qX1 Q1.S16
+#define dW0r D4.S16
+#define dW0i D5.S16
+#define dW1r D6.S16
+#define dW1i D7.S16
+#define dW0rS32 D4.S32
+#define dW0iS32 D5.S32
+#define dW1rS32 D6.S32
+#define dW1iS32 D7.S32
+#define dT0 D8.S16
+#define dT1 D9.S16
+#define dT2 D10.S16
+#define dT3 D11.S16
+#define qT0 Q6.S32
+#define qT1 Q7.S32
+#define qT2 Q8.S32
+#define qT3 Q9.S32
+#define dY0r D4.S16
+#define dY0i D5.S16
+#define dY1r D6.S16
+#define dY1i D7.S16
+#define qY1 Q3.S16
+#define dY2 D4.S16
+#define dY3 D5.S16
+#define dW0 D6.S16
+#define dW1 D7.S16
+#define dW0Tmp D10.S16
+#define dW1Neg D11.S16
+
+ @ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @ Read the FFT size N from the structure
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @ Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ MOV size,N,ASR #1 @ preserve the contents of N
+ MOV step,N,LSL #1 @ step = N/2 * 4 bytes
+
+ @ Process different FFT sizes with different loops.
+ CMP size,#4
+ BLE smallFFTSize\name
+
+ @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @ Note: W^(k) is stored as negated value and also need to
+ @ conjugate the values from the table.
+
+ @ Z(0) : no need of twiddle multiply
+ @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+ VLD1 dX0S32[0],[pSrc],step
+ ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
+
+ VLD1 dX1S32[0],[pSrc]!
+ SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
+
+ MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
+ SUB step1,step1,#4 @ (N/4-1)*4 bytes
+
+ VHADD dY0,dX0,dX1 @ [b+d | a+c]
+ VHSUB dY1,dX0,dX1 @ [b-d | a-c]
+ VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+
+ .ifeqs "\scaled", "TRUE"
+ VHSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VHADD dX1,dY0,dY1
+ .else
+ VSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VADD dX1,dY0,dY1
+ .endif
+
+ SUB pSrc,pSrc,step
+ VST1 dX0[0],[pOut1]!
+ ADD pTwiddleTmp,pTwiddle,#4 @ W^2
+ VST1 dX1[1],[pOut1]!
+ ADD argTwiddle1,pTwiddle,twStep @ W^1
+
+ BLT decrementScale\name
+ BEQ lastElement\name
+
+ SUB step,step,#20
+ SUB step1,step1,#4 @ (N/4-2)*4 bytes
+ SUB step2, step1, #4
+
+ @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @ Note: W^k is stored as negative values in the table and also need to
+ @ conjugate the values from the table.
+ @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+ @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
+
+evenOddButterflyLoop\name:
+ VLD2 {dX0r,dX0i},[pSrc],step
+ VLD2 {dX1r,dX1i},[pSrc]!
+ SUB pSrc, pSrc, step
+
+ VLD1 dW0r,[argTwiddle1],step1
+ VREV64 qX1,qX1
+ VLD1 dW1r,[argTwiddle1]!
+ VHSUB dT2,dX0r,dX1r @ a-c
+ SUB argTwiddle1, argTwiddle1, step1
+ SUB step1,step1,#16
+
+ VLD1 dW0i,[pTwiddleTmp],step2
+ VHADD dT3,dX0i,dX1i @ b+d
+ VLD1 dW1i,[pTwiddleTmp]!
+ VHADD dT0,dX0r,dX1r @ a+c
+ VHSUB dT1,dX0i,dX1i @ b-d
+ SUB pTwiddleTmp, pTwiddleTmp, step2
+ SUB step2,step2,#16
+
+ SUBS size,size,#8
+
+ VZIP dW1r,dW1i
+ VTRN dW0r,dW0i
+ VZIP dW1iS32, dW1rS32
+
+ VMULL qT0,dW1i,dT2
+ VMLSL qT0,dW1r,dT3
+ VMULL qT1,dW1i,dT3
+ VMLAL qT1,dW1r,dT2
+ VMULL qT2,dW0r,dT2
+ VMLAL qT2,dW0i,dT3
+ VMULL qT3,dW0r,dT3
+ VMLSL qT3,dW0i,dT2
+
+ VRSHRN dX1r,qT0,#15
+ VRSHRN dX1i,qT1,#15
+ VRSHRN dX0r,qT2,#15
+ VRSHRN dX0i,qT3,#15
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VHSUB dY1i,dX1r,dT1
+ .else
+ VADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VSUB dY1i,dX1r,dT1
+ .endif
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY0r,dT0,dX0i @ F(1)
+ VHSUB dY0i,dT1,dX0r
+ .else
+ VADD dY0r,dT0,dX0i @ F(1)
+ VSUB dY0i,dT1,dX0r
+ .endif
+
+ VREV64 qY1,qY1
+
+ VST2 {dY0r,dY0i},[pOut1],step
+ VST2 {dY1r,dY1i},[pOut1]
+ ADD pOut1,pOut1,#16
+ SUB pOut1, pOut1, step
+ SUB step,step,#32
+
+ BGT evenOddButterflyLoop\name
+
+ SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
+ SUB pOut1,pOut1,#4
+ B lastElement\name
+
+smallFFTSize\name:
+ @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @ Note: W^(k) is stored as negated value and also need to
+ @ conjugate the values from the table.
+
+ @ Z(0) : no need of twiddle multiply
+ @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+ VLD1 dX0S32[0],[pSrc],step
+ ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
+
+ VLD1 dX1S32[0],[pSrc]!
+ SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
+
+ MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
+ SUB step1,step1,#4 @ (N/4-1)*4 bytes
+
+ VHADD dY0,dX0,dX1 @ [b+d | a+c]
+ VHSUB dY1,dX0,dX1 @ [b-d | a-c]
+ VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+
+ .ifeqs "\scaled", "TRUE"
+ VHSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VHADD dX1,dY0,dY1
+ .else
+ VSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VADD dX1,dY0,dY1
+ .endif
+
+ SUB pSrc,pSrc,step
+ VST1 dX0[0],[pOut1]!
+ ADD pTwiddleTmp,pTwiddle,#4 @ W^2
+ VST1 dX1[1],[pOut1]!
+ ADD argTwiddle1,pTwiddle,twStep @ W^1
+
+ BLT decrementScale\name
+ BEQ lastElement\name
+
+ @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @ Note: W^k is stored as negative values in the table and also need to
+ @ conjugate the values from the table.
+ @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+ @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
+
+ SUB step,step,#12
+
+evenOddButterflyLoopSize4\name:
+ VLD1 dW0rS32[0],[argTwiddle1],step1
+ VLD1 dW1rS32[0],[argTwiddle1]!
+
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ VLD2 {dX0r[1],dX0i[1]},[pSrc],step
+ SUB pSrc,pSrc,#4
+ SUB argTwiddle1,argTwiddle1,step1
+ VLD2 {dX1r[0],dX1i[0]},[pSrc]!
+ VLD2 {dX1r[1],dX1i[1]},[pSrc]!
+
+ SUB step1,step1,#4 @ (N/4-2)*4 bytes
+ VLD1 dW0iS32[0],[pTwiddleTmp],step1
+ VLD1 dW1iS32[0],[pTwiddleTmp]!
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ VREV32 dX1r,dX1r
+ VREV32 dX1i,dX1i
+ SUBS size,size,#4
+
+ VHSUB dT2,dX0r,dX1r @ a-c
+ VHADD dT3,dX0i,dX1i @ b+d
+ SUB step1,step1,#4
+ VHADD dT0,dX0r,dX1r @ a+c
+ VHSUB dT1,dX0i,dX1i @ b-d
+
+ VTRN dW1r,dW1i
+ VTRN dW0r,dW0i
+
+ VMULL qT0,dW1r,dT2
+ VMLSL qT0,dW1i,dT3
+ VMULL qT1,dW1r,dT3
+ VMLAL qT1,dW1i,dT2
+ VMULL qT2,dW0r,dT2
+ VMLAL qT2,dW0i,dT3
+ VMULL qT3,dW0r,dT3
+ VMLSL qT3,dW0i,dT2
+
+ VRSHRN dX1r,qT0,#15
+ VRSHRN dX1i,qT1,#15
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VHSUB dY1i,dX1r,dT1
+ .else
+ VADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VSUB dY1i,dX1r,dT1
+ .endif
+
+ VREV32 dY1r,dY1r
+ VREV32 dY1i,dY1i
+
+ VRSHRN dX0r,qT2,#15
+ VRSHRN dX0i,qT3,#15
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY0r,dT0,dX0i @ F(1)
+ VHSUB dY0i,dT1,dX0r
+ .else
+ VADD dY0r,dT0,dX0i @ F(1)
+ VSUB dY0i,dT1,dX0r
+ .endif
+
+ VST2 {dY0r[0],dY0i[0]},[pOut1]!
+ VST2 {dY0r[1],dY0i[1]},[pOut1],step
+ SUB pOut1, #4
+ VST2 {dY1r[0],dY1i[0]},[pOut1]!
+ VST2 {dY1r[1],dY1i[1]},[pOut1]!
+ SUB pOut1,pOut1,step
+ SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
+ SUB pOut1,pOut1,#4
+
+ @ Last element can be expanded as follows
+ @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
+ @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] - j (c-jd) [0+j2b]
+ @ (a+bc, -bd)
+ @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name:
+ VLD1 dX0rS32[0],[pSrc]
+
+ .ifeqs "\scaled", "TRUE"
+ VSHR dX0r,dX0r,#1
+ .endif
+
+ VST1 dX0r[0],[pOut1]!
+ VNEG dX0r,dX0r
+ VST1 dX0r[1],[pOut1]
+
+decrementScale\name:
+ .ifeqs "\scaled", "TRUE"
+ SUB scale,scale,#1
+ .endif
+
+ .endm
+
+ M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
+ FFTSTAGE "FALSE","TRUE",Inv
+ M_END
+
+ M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
+ FFTSTAGE "TRUE","TRUE",InvSfs
+ M_END
+
+
+ .end
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S
index 57fef7a9404..9959f8fdde8 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S
@@ -30,8 +30,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
index 323eb8319da..88a08ff3fab 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
@@ -21,8 +21,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S
index 02f3888c56f..85b85295076 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S
@@ -21,8 +21,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S
index 73c1f4b82f3..20c35e15651 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S
@@ -21,8 +21,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
index ff62dd132b8..dbe170c62e0 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
@@ -21,8 +21,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
index 9d2e4ab8b44..af86b919a8b 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
@@ -20,8 +20,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
index ae450c5f629..8f63eb8510f 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@@ -21,8 +21,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
index 4447e76b1f7..19a2f253dc0 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
@@ -20,8 +20,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
index a16c79f75eb..4bdbb52c914 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
index 9f7b531d300..94b3d49e848 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
index 666f4f349a7..2b34d997341 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
index f9bbebcca91..17e0415e822 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
index cdb42a994a1..049621bfabc 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
@@ -142,7 +142,6 @@
RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
- VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3]
MOV subFFTSize,#4 @// subFFTSize = 1 for the first stage
@@ -158,6 +157,7 @@
grpZeroSetLoop\name:
+ VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3]
.ifeqs "\scaled", "TRUE"
@@ -178,9 +178,6 @@ grpZeroSetLoop\name:
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VHADD qZ0,qY0,qY1 @// y0
- VLD2 {dXr3,dXi3},[pSrc :128],setStep
-
-
.ifeqs "\inverse", "TRUE"
VHSUB dZr3,dYr2,dYi3 @// y3
@@ -235,9 +232,6 @@ grpZeroSetLoop\name:
VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
VADD qZ0,qY0,qY1 @// y0
- VLD2 {dXr3,dXi3},[pSrc :128],setStep
-
-
.ifeqs "\inverse", "TRUE"
VSUB dZr3,dYr2,dYi3 @// y3
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
index 23e2c373d62..4e46a010641 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@@ -163,7 +163,6 @@
@// Define stack arguments
MOV pw2,pTwiddle
- VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
MOV pw3,pTwiddle
MOV pw1,pTwiddle
@@ -171,42 +170,47 @@
@// pOut0+outPointStep == increment of 4*outPointStep bytes
MOV outPointStep,subFFTSize,LSL #2
- VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
MOV subFFTNum,#1 @//after the last stage
LSL grpCount,subFFTSize,#2
@// Update grpCount and grpSize rightaway
- VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
@// update subFFTSize for the next stage
MOV subFFTSize,grpCount
MOV dstStep,outPointStep,LSL #1
- VLD2 {dW1r,dW1i}, [pw1 :128]!
-
-
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
- VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
- VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
-
@// Process 4 groups at a time
grpLoop\name:
+ VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+ VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+ @// Load the second twiddle for 4 groups : w^2
+ @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3
+ VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
- @// Rearrange the third twiddle
- VUZP dW3r,dW3i
- SUBS grpCount,grpCount,#16 @// grpCount is multiplied by 4
+ VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r
+ @// Load the third twiddle for 4 groups : w^3
+ @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3
+ VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
- VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r
VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r
+
+ VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
+ VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+ @// Rearrange the third twiddle
+ VUZP dW3r,dW3i
+ SUBS grpCount,grpCount,#16 @// grpCount is multiplied by 4
.ifeqs "\inverse", "TRUE"
VMULL qT0,dXr1,dW1r
@@ -225,8 +229,6 @@ grpLoop\name:
@// Load the first twiddle for 4 groups : w^1
@// w^1 twiddle (i+0,i+1,i+2,i+3) for group 0,1,2,3
- VLD2 {dW1r,dW1i}, [pw1 :128]!
-
.ifeqs "\inverse", "TRUE"
VMULL qT2,dXr2,dW2r
VMLAL qT2,dXi2,dW2i @// real part
@@ -260,24 +262,12 @@ grpLoop\name:
.ENDIF
- @// Load the second twiddle for 4 groups : w^2
- @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3
- VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
-
-
VRSHRN dZr2,qT2,#15
VRSHRN dZi2,qT3,#15
- @// Load the third twiddle for 4 groups : w^3
- @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3
-
- VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
-
VRSHRN dZr3,qT0,#15
VRSHRN dZi3,qT1,#15
- VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
-
.ifeqs "\scaled", "TRUE"
@// finish first stage of 4 point FFT
@@ -285,7 +275,6 @@ grpLoop\name:
VHADD qY0,qX0,qZ2
VHSUB qY2,qX0,qZ2
VHADD qY1,qZ1,qZ3
- VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
VHSUB qY3,qZ1,qZ3
@@ -293,7 +282,6 @@ grpLoop\name:
VHSUB qZ0,qY2,qY1
VHADD qZ2,qY2,qY1
- VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
.ifeqs "\inverse", "TRUE"
@@ -329,7 +317,6 @@ grpLoop\name:
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
VADD qY1,qZ1,qZ3
- VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
VSUB qY3,qZ1,qZ3
@@ -337,7 +324,6 @@ grpLoop\name:
VSUB qZ0,qY2,qY1
VADD qZ2,qY2,qY1
- VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
.ifeqs "\inverse", "TRUE"
@@ -376,7 +362,6 @@ grpLoop\name:
@// Reset and Swap pSrc and pDst for the next stage
MOV pTmp,pDst
- SUB pSrc,pSrc,#64 @// Extra increment currently done in the loop
SUB pDst,pSrc,outPointStep,LSL #2 @// pDst -= size; pSrc -= 4*size bytes
SUB pSrc,pTmp,outPointStep
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
index 0eba3856f2a..7bdbe41e08d 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@@ -154,7 +154,6 @@
MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
VLD1 dW2,[pTwiddle :64] @//[wi | wr]
ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
- SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
VLD1 dW3,[pTwiddle :64]
@//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
RSB setStep,setStep,#0 @// setStep = - 3*pointStep
@@ -167,26 +166,23 @@
grpLoop\name:
- VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
ADD stepTwiddle,stepTwiddle,pointStep
- VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point
- VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
MOV twStep,stepTwiddle,LSL #2
- VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc
SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
MOV setCount,pointStep,LSR #2
- ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set
- ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
+ ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
@// Loop on the sets : 4 at a time
setLoop\name:
+ VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
+ VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
- SUBS setCount,setCount,#4 @// decrement the loop counter
+ SUBS setCount,setCount,#4 @// decrement the loop counter
.ifeqs "\inverse", "TRUE"
VMULL qT0,dXr1,dW1[0]
@@ -202,8 +198,6 @@ setLoop\name:
.ENDIF
- VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
-
.ifeqs "\inverse", "TRUE"
VMULL qT2,dXr2,dW2[0]
VMLAL qT2,dXi2,dW2[1] @// real part
@@ -218,11 +212,13 @@ setLoop\name:
.ENDIF
+ VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
+
VRSHRN dZr1,qT0,#15
VRSHRN dZi1,qT1,#15
-
- VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
+ VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
+ ADD pSrc,pSrc,#16 @// set pSrc to data[1] of the next set
.ifeqs "\inverse", "TRUE"
VMULL qT0,dXr3,dW3[0]
@@ -244,7 +240,6 @@ setLoop\name:
VRSHRN dZr3,qT0,#15
VRSHRN dZi3,qT1,#15
- VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
.ifeqs "\scaled", "TRUE"
@@ -253,7 +248,6 @@ setLoop\name:
VHADD qY0,qX0,qZ2
VHSUB qY2,qX0,qZ2
- VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0]
VHADD qY1,qZ1,qZ3
VHSUB qY3,qZ1,qZ3
@@ -303,7 +297,6 @@ setLoop\name:
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
- VLD2 {dXr0,dXi0},[pSrc]! @// data[0]
VADD qY1,qZ1,qZ3
VSUB qY3,qZ1,qZ3
@@ -351,7 +344,6 @@ setLoop\name:
.ENDIF
- ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
BGT setLoop\name
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
index 588c3197db9..f9ff37a275d 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@@ -233,12 +233,12 @@
VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
- VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set
- @// setStep = -7*pointStep + 16
@// grp = 0 a special case since all the twiddle factors are 1
@// Loop on the sets : 4 sets at a time
grpZeroSetLoop\name:
+ VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set
+ @// setStep = -7*pointStep + 16
@// Decrement setcount
SUBS setCount,setCount,#4 @// decrement the set loop counter
@@ -348,9 +348,6 @@ grpZeroSetLoop\name:
VSUB dVi7,dVi7,dT1
SUB pDst, pDst, step2 @// set pDst to y1
- VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
-
-
VHSUB dYr3,dVr3,dVr7
VHSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
@@ -388,7 +385,6 @@ grpZeroSetLoop\name:
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
- VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VHSUB qY5,qV1,qV5
@@ -514,9 +510,6 @@ grpZeroSetLoop\name:
VSUB dVi7,dVi7,dT1
SUB pDst, pDst, step2 @// set pDst to y1
- VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
-
-
VSUB dYr3,dVr3,dVr7
VSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
@@ -554,7 +547,6 @@ grpZeroSetLoop\name:
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
- VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
VSUB qY5,qV1,qV5
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
index 3bc5f02a743..de589c95fa5 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
index 30a8f56b487..eeb8c6eb289 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
@@ -30,8 +30,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S
index a9700ec3eab..967d7b59750 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
index 685f85b6f6e..412b64fb59a 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S
index 1b5478b2503..91e5299e071 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S
@@ -28,8 +28,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
index 3c23983efee..22efea45b0b 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
@@ -30,8 +30,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
index a5fb0e27105..d4d4abb4c21 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
@@ -30,8 +30,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
index da0c10f1f66..aa761126a82 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
@@ -20,8 +20,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
index ca15c6b06cb..a3c21ac015d 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S
index 90f969a83d5..504ef955d24 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S
@@ -27,8 +27,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
index fda1ae4a16e..fda446cc896 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
@@ -20,8 +20,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S
index 84d230036fc..402885fa8fb 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S
@@ -28,8 +28,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
new file mode 100644
index 00000000000..e9530774cdf
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
@@ -0,0 +1,639 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ omxSP_FFTFwd_RToCCS_S32_Sfs_s.S which was licensed as follows.
+@ It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision: 7810
+@ Last Modified Date: Thu, 04 Oct 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute a forward FFT for a real signal, using 16 bit complex FFT routines.
+@
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@Input Registers
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+@ Output registers
+#define result r0
+
+@Local Scratch Registers
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define tmpOrder r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@ Total number of radix stages to complete the FFT
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+#define subFFTSizeTmp r6
+#define step r3
+#define stepr r11
+#define step1 r10
+#define step1r r6
+#define step2 r8
+#define step2r r9
+#define twStep r8
+#define zero r9
+#define pTwiddleTmp r5
+#define t0 r10
+
+@ Neon registers
+#define dX0 d0.s16
+#define dX0S32 d0.s32
+#define dzero d1.s16
+#define dZero d2.s16
+#define dShift d3.s16
+#define qShift q1.s16
+#define dX0r d2.s16
+#define dX0i d3.s16
+#define dX1r d4.s16
+#define dX1i d5.s16
+#define qX1 q2.s16
+#define dX0rS32 d2.s32
+#define dX0iS32 d3.s32
+#define dX1rS32 d4.s32
+#define dX1iS32 d5.s32
+#define dT0 d6.s16
+#define dT1 d7.s16
+#define dT2 d8.s16
+#define dT3 d9.s16
+#define qT0 q5.s32
+#define qT1 q6.s32
+#define qT0s q5.s16
+#define qT1s q6.s16
+#define dW0r d14.s16
+#define dW0i d15.s16
+#define dW1r d16.s16
+#define dW1i d17.s16
+#define dW0rS32 d14.s32
+#define dW0iS32 d15.s32
+#define dW1rS32 d16.s32
+#define dW1iS32 d17.s32
+#define dY0r d14.s16
+#define dY0i d15.s16
+#define dY0rS32 d14.s32
+#define dY0iS32 d15.s32
+#define dY1r d16.s16
+#define dY1i d17.s16
+#define qY1 q8.s16
+#define dY1rS32 d16.s32
+#define dY1iS32 d17.s32
+#define dY0rS64 d14.s32
+#define dY0iS64 d15.s32
+#define qT2 q9.s32
+#define qT3 q10.s32
+#define d18s16 d18.s16
+#define d19s16 d19.s16
+#define d20s16 d20.s16
+#define d21s16 d21.s16
+@ Last three elements
+#define dX1 d3.s16
+#define dW0 d4.s16
+#define dW1 d5.s16
+#define dY0 d10.s16
+#define dY1 d11.s16
+#define dY2 d12.s16
+#define dY3 d13.s16
+
+ @ Allocate stack memory required by the function
+ M_ALLOC4 diffOnStack, 4
+
+ @ Write function header
+ M_START omxSP_FFTFwd_RToCCS_S16_Sfs,r11,d15
+
+ @ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @ Define stack arguments
+
+ @ Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @ Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @ Do a N/2 point complex FFT including the scaling
+
+ MOV N,N,ASR #1 @ N/2 point complex FFT
+
+ CLZ order,N @ N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+
+ CMP order,#3
+ BGT orderGreaterthan3 @ order > 3
+
+ CMP order,#1
+ BGE orderGreaterthan0 @ order > 0
+ M_STR scale, diffOnStack,LT @ order = 0
+ LDR x0r,[pSrc]
+ STR x0r,[pOut]
+ MOV pSrc,pOut
+ MOV argDst,pDst
+ B FFTEnd
+
+orderGreaterthan0:
+ @ set the buffers appropriately for various orders
+ CMP order,#2
+ MOVEQ argDst,pDst
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @ Pass 1st stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ SUBS diff,scale,order
+ M_STR diff,diffOnStack
+ MOVGT scale,order
+ @ Now scale <= order
+
+ CMP order,#1
+ BGT orderGreaterthan1
+ @ order = 1:
+ SUBS scale,scale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ B FFTEnd
+
+orderGreaterthan1:
+ CMP order,#2
+ MOV argScale,scale
+ BGT orderGreaterthan2
+ @ order = 2:
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+orderGreaterthan2: @ order = 3
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+orderGreaterthan3:
+ @ check scale = 0 or scale = order
+ SUBS diff, scale, order @ scale > order
+ MOVGT scale,order
+ BGE specialScaleCase @ scale = 0 or scale = order
+ CMP scale,#0
+ BEQ specialScaleCase
+ B generalScaleCase
+
+specialScaleCase: @ scale = 0, or, scale = order && order > 3
+ TST order, #2 @ Set input args to fft stages
+ MOVEQ argDst,pDst
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @ Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#0
+ M_STR diff, diffOnStack
+ BGE scaleEqualsOrder
+
+ @ check for even or odd order.
+ @ NOTE: The following combination of BL's would work fine even though
+ @ the first BL would corrupt the flags. This is because the end of the
+ @ "grpZeroSetLoop" loop inside
+ @ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+unscaledRadix4Loop:
+ BEQ lastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+scaleEqualsOrder:
+ @ check for even or odd order
+ @ NOTE: The following combination of BL's would work fine even though
+ @ the first BL would corrupt the flags. This is because the end of the
+ @ "grpZeroSetLoop" loop inside
+ @ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+scaledRadix4Loop:
+ BEQ lastStageScaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B scaledRadix4Loop
+
+lastStageScaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+generalScaleCase: @ 0 < scale < order and order > 3
+ @ Determine the correct destination buffer
+ SUB diff,order,scale
+ TST diff,#0x01
+ ADDEQ count,scale,diff,LSR #1 @ count = scale + (order - scale)/2
+ MOVNE count,order
+ TST count,#0x01 @ Is count even or odd ?
+
+ MOVEQ argDst,pDst @ Set input args to fft stages
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @ Pass 1st stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#1
+ M_STR diff, diffOnStack
+ BEQ scaleps @ scaling including a radix2_ps stage
+
+ MOV argScale,scale @ Put scale in RN4 to save and restore
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+
+scaledRadix2Loop:
+ BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 @ save, restore scale in scaled stages
+ BGT scaledRadix2Loop
+ B outScale
+
+scaleps:
+ SUB argScale,scale,#1 @ order>3 and diff=1 => scale >= 3
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+
+scaledRadix2psLoop:
+ BEQ scaledRadix2psStage
+ BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 @ save, restore scale in scaled stages
+ BGE scaledRadix2psLoop
+
+scaledRadix2psStage:
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ B generalLastStageUnscaledRadix2
+
+outScale:
+ M_LDR diff, diffOnStack
+ @check for even or odd order
+ TST diff,#0x00000001
+ BEQ generalUnscaledRadix4Loop
+ B unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:
+ CMP subFFTNum,#4
+ BEQ generalLastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ B generalUnscaledRadix4Loop
+
+generalLastStageUnscaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B End
+
+unscaledRadix2Loop:
+ CMP subFFTNum,#4
+ BEQ generalLastTwoStagesUnscaledRadix2
+ BL armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+ B unscaledRadix2Loop
+
+generalLastTwoStagesUnscaledRadix2:
+ BL armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2:
+ BL armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B End
+
+FFTEnd: @ Does only the scaling
+ M_LDR diff, diffOnStack
+ CMP diff,#0
+ BLE finalComplexToRealFixup
+
+ RSB diff,diff,#0 @ for right shift by a variable
+ VDUP qShift,diff
+
+ @ save subFFTSize and use subFFTSizeTmp in the following loop
+ MOV subFFTSizeTmp,subFFTSize @ subFFTSizeTmp same reg as subFFTNum
+
+ @ Use parallel loads for bigger FFT size.
+ CMP subFFTSizeTmp, #8
+ BLT scaleLessFFTData
+
+scaleFFTData:
+ VLD1 {qT0s, qT1s},[pSrc:256] @ pSrc contains pDst pointer
+ SUBS subFFTSizeTmp,subFFTSizeTmp,#8
+ VSHL qT0s,qShift
+ VSHL qT1s,qShift
+ VST1 {qT0s, qT1s},[pSrc:256]!
+ BGT scaleFFTData
+ B afterScaling
+
+scaleLessFFTData:
+ VLD1 {dX0S32[0]},[pSrc] @ pSrc contains pDst pointer
+ SUBS subFFTSizeTmp,subFFTSizeTmp,#1
+ VSHL dX0,dShift
+ VST1 {dX0S32[0]},[pSrc]!
+ BGT scaleLessFFTData
+
+afterScaling:
+ SUB pSrc,pSrc,subFFTSize,LSL #2 @ reset pSrc for final fixup
+
+ @ change the logic so that output after scaling is in pOut and not in pDst
+ @ finally store from pOut to pDst
+ @ change branch "End" to branch "finalComplexToRealFixup" in the above
+ @ chk the code below for multiplication by j factor
+
+finalComplexToRealFixup:
+ @ F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+ @ 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] - j [0+j2b]
+ @ (a+b, 0)
+
+ @ F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+ @ 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] + j [0+j2b]
+ @ (a-b, 0)
+
+ CMP subFFTSize,#4
+ BLE smallFFTSize
+
+@ subFFTSize > 4:
+ @ F(0) and F(N/2)
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ MOV zero,#0
+ VMOV dX0r[1],zero
+ MOV step,subFFTSize,LSL #2 @ step = N/2 * 4 bytes
+ VMOV dX0i[1],zero
+ SUB twStep,step,subFFTSize @ twStep = 3N/8 * 8 bytes
+
+ VADD dY0r,dX0r,dX0i @ F(0) = ((Z0.r+Z0.i) , 0)
+ MOV step1,subFFTSize,LSL #1 @ step1 = N/2 * 2 bytes
+ VSUB dY0i,dX0r,dX0i @ F(N/2) = ((Z0.r-Z0.i) , 0)
+ SUBS subFFTSize,subFFTSize,#2
+
+ VST1 dY0rS32[0],[argDst], step
+ ADD pTwiddleTmp,argTwiddle,#4 @ W^2
+ VST1 dY0iS32[0],[argDst]!
+ ADD argTwiddle,argTwiddle,twStep @ W^1
+
+ VDUP dzero,zero
+ SUB argDst,argDst,step
+ SUB step,step,#20
+ RSB stepr, step, #16
+ SUB step1,step1,#8 @ (N/4-1)*8 bytes
+ RSB step1r,step1,#8
+
+ SUB step2, step1, #4
+ RSB step2r, step2, #8
+
+ @ F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+ @ Note: W^k is stored as negative values in the table.
+ @ Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+ @ since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1).
+
+evenOddButterflyLoop:
+ VLD2 {dX0r,dX0i},[pSrc],step
+ VLD2 {dX1r,dX1i},[pSrc],stepr
+
+ VLD1 dW0r,[argTwiddle],step1
+ SUB step1, step1, #16
+ VREV64 qX1,qX1
+
+ VLD1 dW1r,[argTwiddle],step1r
+ ADD step1r, step1r, #16
+ VSUB dT2,dX0r,dX1r @ a-c
+
+ VLD1 dW0i,[pTwiddleTmp],step2
+ SUB step2, step2, #16
+ VADD dT3,dX0i,dX1i @ b+d
+
+ VLD1 dW1i,[pTwiddleTmp],step2r
+ ADD step2r, step2r, #16
+
+ VTRN dW0r,dW0i
+ VZIP dW1r, dW1i
+
+ SUBS subFFTSize,subFFTSize,#8
+
+ VHADD dT0,dX0r,dX1r @ (a+c)/2
+ VZIP dW1iS32, dW1rS32
+ VHSUB dT1,dX0i,dX1i @ (b-d)/2
+
+ VQDMULH dY0,dW1i,dT2
+ VQDMULH dY1,dW1r,dT3
+ VQDMULH dY2,dW1i,dT3
+ VQDMULH dY3,dW1r,dT2
+
+ VQDMULH d18s16,dW0r,dT2
+ VQDMULH d19s16,dW0i,dT3
+ VQDMULH d20s16,dW0r,dT3
+ VQDMULH d21s16,dW0i,dT2
+
+ VRHADD dX1r, dY0, dY1
+ VHSUB dX1i, dY2, dY3
+ VHSUB dX0r, d18s16, d19s16
+ VADD dY1i,dT1,dX1r
+ VRHADD dX0i, d20s16, d21s16
+ VSUB dY1r,dT0,dX1i @ F(N/2 -1)
+ VSUB dY0r,dT0,dX0i @ F(1)
+ VADD dY0i,dT1,dX0r
+
+ VNEG dY1i,dY1i
+ VREV64 qY1, qY1
+
+ VST2 {dY0r,dY0i},[argDst],step
+ SUB step,step,#32 @ (N/2-4)*4 bytes
+ VST2 {dY1r,dY1i},[argDst],stepr
+ ADD stepr,stepr,#32
+
+ BGT evenOddButterflyLoop
+
+ SUB pSrc,pSrc,#4 @ points to the last element.
+ SUB argDst,argDst,#4 @ points to the last element.
+
+ b lastElement
+
+smallFFTSize:
+
+ @ F(0) and F(N/2)
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ MOV zero,#0
+ VMOV dX0r[1],zero
+ MOV step,subFFTSize,LSL #2 @ step = N/2 * 4 bytes
+ VMOV dX0i[1],zero
+ SUB twStep,step,subFFTSize @ twStep = 3N/8 * 8 bytes
+
+ VADD dY0r,dX0r,dX0i @ F(0) = ((Z0.r+Z0.i) , 0)
+ MOV step1,subFFTSize,LSL #1 @ step1 = N/2 * 2 bytes
+ VSUB dY0i,dX0r,dX0i @ F(N/2) = ((Z0.r-Z0.i) , 0)
+ SUBS subFFTSize,subFFTSize,#2
+
+
+ VST1 dY0rS32[0],[argDst], step
+ ADD pTwiddleTmp,argTwiddle,#4 @ W^2
+ VST1 dY0iS32[0],[argDst]!
+ ADD argTwiddle,argTwiddle,twStep @ W^1
+
+ VDUP dzero,zero
+ SUB argDst,argDst,step
+
+ BLT End
+ BEQ lastElement
+
+ SUB step,step,#12
+ SUB step1,step1,#4 @ (N/4-1)*8 bytes
+
+ @ F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+
+butterflyLoopSubFFTSize4:
+ VLD1 dW0rS32[0], [argTwiddle],step1
+ VLD1 dW1rS32[0],[argTwiddle]!
+
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ VLD2 {dX0r[1],dX0i[1]},[pSrc],step
+ SUB pSrc,pSrc,#4
+ SUB argTwiddle,argTwiddle,step1
+ VLD2 {dX1r[0],dX1i[0]},[pSrc]!
+ VLD2 {dX1r[1],dX1i[1]},[pSrc]!
+
+ SUB step1,step1,#4 @ (N/4-2)*4 bytes
+ VLD1 dW0iS32[0],[pTwiddleTmp],step1
+ VLD1 dW1iS32[0],[pTwiddleTmp]!
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ VREV32 dX1r,dX1r
+ VREV32 dX1i,dX1i
+ SUBS subFFTSize,subFFTSize,#4
+
+ VSUB dT2,dX0r,dX1r @ a-c
+ SUB step1,step1,#4
+ VADD dT3,dX0i,dX1i @ b+d
+ VADD dT0,dX0r,dX1r @ a+c
+ VSUB dT1,dX0i,dX1i @ b-d
+ VHADD dT0,dT0,dzero
+ VHADD dT1,dT1,dzero
+
+ VTRN dW1r,dW1i
+ VTRN dW0r,dW0i
+
+ VMULL qT0,dW1r,dT2
+ VMLAL qT0,dW1i,dT3
+ VMULL qT1,dW1r,dT3
+ VMLSL qT1,dW1i,dT2
+
+ VMULL qT2,dW0r,dT2
+ VMLSL qT2,dW0i,dT3
+ VMULL qT3,dW0r,dT3
+ VMLAL qT3,dW0i,dT2
+
+ VRSHRN dX1r,qT0,#16
+ VRSHRN dX1i,qT1,#16
+
+ VSUB dY1r,dT0,dX1i @ F(N/2 -1)
+ VADD dY1i,dT1,dX1r
+ VNEG dY1i,dY1i
+
+ VREV32 dY1r,dY1r
+ VREV32 dY1i,dY1i
+
+ VRSHRN dX0r,qT2,#16
+ VRSHRN dX0i,qT3,#16
+
+ VSUB dY0r,dT0,dX0i @ F(1)
+ VADD dY0i,dT1,dX0r
+
+ VST2 {dY0r[0],dY0i[0]},[argDst]!
+ VST2 {dY0r[1],dY0i[1]},[argDst],step
+ SUB argDst, #4
+ VST2 {dY1r[0],dY1i[0]},[argDst]!
+ VST2 {dY1r[1],dY1i[1]},[argDst]!
+ SUB argDst,argDst,step
+ SUB pSrc,pSrc,#4 @ points to the last element.
+ SUB argDst,argDst,#4 @ points to the last element.
+
+lastElement:
+ @ Last element can be expanded as follows
+ @ 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+ @ 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] + j (c+jd) [0+j2b]
+ @ (a-bc, -bd)
+ @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+ VLD1 dX0rS32[0],[pSrc]
+ VST1 dX0r[0],[argDst]!
+ VNEG dX0r,dX0r
+ VST1 dX0r[1],[argDst]!
+
+End:
+ @ Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @ Write function tail
+ M_END
+
+ .END
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S
index a742162e616..c1385c025ed 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
index 5deaf896c53..9c45b54cdc1 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
@@ -20,8 +20,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
new file mode 100644
index 00000000000..311dba99e83
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
@@ -0,0 +1,301 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ omxSP_FFTInv_CToC_SC16_Sfs_s.S which was licensed as follows.
+@ It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ File Name: omxSP_FFTInv_CToC_SC16_Sfs_s.s
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision: 6729
+@ Last Modified Date: Tue, 17 Jul 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute an inverse FFT for a 16-bit real signal, with complex FFT routines.
+@
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@Input Registers
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+@ Output registers
+#define result r0
+
+@Local Scratch Registers
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define tmpOrder r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@ Total number of radix stages to complete the FFT
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+#define pOut1 r2
+#define size r7
+#define step r8
+#define step1 r9
+#define twStep r10
+#define pTwiddleTmp r11
+#define argTwiddle1 r12
+#define zero r14
+
+@ Neon registers
+#define dX0 D0.S32
+#define dShift D1.S32
+#define qShift Q0.s16
+#define dX1 D1.S32
+#define dY0 D2.S32
+#define dY1 D3.S32
+#define dX0r D0.S32
+#define dX0i D1.S32
+#define dX1r D2.S32
+#define dX1i D3.S32
+#define dW0r D4.S32
+#define dW0i D5.S32
+#define dW1r D6.S32
+#define dW1i D7.S32
+#define dT0 D8.S32
+#define dT1 D9.S32
+#define dT2 D10.S32
+#define dT3 D11.S32
+#define qT0 Q6.S64
+#define qT1 Q7.S64
+#define qT0s Q6.S16
+#define qT1s Q7.S16
+#define qT2 Q8.S64
+#define qT3 Q9.S64
+#define dY0r D4.S32
+#define dY0i D5.S32
+#define dY1r D6.S32
+#define dY1i D7.S32
+#define dzero D20.S32
+#define dY2 D4.S32
+#define dY3 D5.S32
+#define dW0 D6.S32
+#define dW1 D7.S32
+#define dW0Tmp D10.S32
+#define dW1Neg D11.S32
+
+
+
+ @ Allocate stack memory required by the function
+ M_ALLOC4 diffOnStack, 4
+
+ @ Write function header
+ M_START omxSP_FFTInv_CCSToR_S16_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @ Define stack arguments
+
+ @ Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @ Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @ Call the preTwiddle Radix2 stage before doing the complex IFFT
+
+ @ The following conditional BL combination would work since
+ @ evenOddButterflyLoop in the first call would set Z flag to zero
+
+ CMP scale,#0
+ BLEQ armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe
+ BLGT armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe
+
+complexIFFT:
+
+ ASR N,N,#1 @ N/2 point complex IFFT
+ ADD pSrc,pOut,N,LSL #2 @ set pSrc as pOut1
+
+ CLZ order,N @ N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+
+ ADD scale,scale,order @ FFTInverse has a final scaling factor by N
+
+ CMP order,#3
+ BGT orderGreaterthan3 @ order > 3
+
+ CMP order,#1
+ BGE orderGreaterthan0 @ order > 0
+ M_STR scale, diffOnStack,LT @ order = 0
+ LDRLT x0r,[pSrc]
+ STRLT x0r,[pDst]
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+orderGreaterthan0:
+ @ set the buffers appropriately for various orders
+ CMP order,#2
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @ Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+ @ Store the scale factor and scale at the end
+ SUB diff,scale,order
+ M_STR diff, diffOnStack
+ BGE orderGreaterthan1
+ BLLT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @ order = 1
+ B FFTEnd
+
+
+orderGreaterthan1:
+ MOV tmpOrder,order @ tmpOrder = RN 4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ CMP tmpOrder,#2
+ BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+
+
+orderGreaterthan3:
+ @ check scale = 0 or scale = order
+ SUB diff, scale, order @ scale > order
+
+ TST order, #2 @ Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @ Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#0
+ M_STR diff, diffOnStack
+ BGE scaleEqualsOrder
+
+ @check for even or odd order
+ @ NOTE: The following combination of BL's would work fine even though the first
+ @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+unscaledRadix4Loop:
+ BEQ lastStageUnscaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+ BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+scaleEqualsOrder:
+ @check for even or odd order
+ @ NOTE: The following combination of BL's would work fine even though the first
+ @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @ armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+scaledRadix4Loop:
+ BEQ lastStageScaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B scaledRadix4Loop
+
+lastStageScaledRadix4:
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+
+FFTEnd: @ Does only the scaling
+
+ M_LDR diff, diffOnStack
+ CMP diff,#0
+ BLE End
+
+ RSB diff,diff,#0 @ to use VRSHL for right shift by a variable
+ VDUP qShift,diff
+
+ @ Use parallel loads for bigger FFT size.
+ CMP subFFTSize, #8
+ BLT scaleLessFFTData
+
+scaleFFTData:
+ VLD1 {qT0s, qT1s},[pSrc:256] @ pSrc contains pDst pointer
+ SUBS subFFTSize,subFFTSize,#8
+ VSHL qT0s,qShift
+ VSHL qT1s,qShift
+ VST1 {qT0s, qT1s},[pSrc:256]!
+ BGT scaleFFTData
+ B End
+
+scaleLessFFTData: @ N = subFFTSize ; dataptr = pDst ; scale = diff
+ VLD1 {dX0[0]},[pSrc] @ pSrc contains pDst pointer
+ SUBS subFFTSize,subFFTSize,#1
+ VRSHL dX0,dShift
+ VST1 {dX0[0]},[pSrc]!
+ BGT scaleLessFFTData
+
+End:
+ @ Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @ Write function tail
+ M_END
+
+
+
+
+
+
+ .END
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S
index becc0327e7f..f2f2d025d22 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S
index 003d666036d..10ce047dbff 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S
index c2e86d2f7e8..73a6549f00c 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S
@@ -20,8 +20,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S
index ff85e2b5af6..2388d0f5811 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S
@@ -29,8 +29,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S
index 09c461cc78f..7df624301c3 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S
@@ -28,8 +28,8 @@
@// Include standard headers
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c
index 081f23739dd..6ac9de85a90 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c
index 288c76ca614..1fc4fe2bd6f 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c
@@ -25,7 +25,7 @@
* Compute the size of the specification structure required
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c
index 0ca3b5664b4..176586407cb 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c
@@ -25,7 +25,7 @@
* Compute the size of the specification structure required
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_F32.c
index 19b16bbd959..046d069d06e 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_F32.c
@@ -9,7 +9,7 @@
*
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16.c
new file mode 100644
index 00000000000..7ad27500dc0
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Some code in this file was originally from file omxSP_FFTGetBufSize_R_S32.c
+ * which was licensed as follows.
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+/*
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:
+ * Last Modified Date:
+ */
+
+#include "dl/api/arm/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_S16
+ *
+ * Description:
+ * Computes the size of the specification structure required for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 16-bit functions
+ * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Parameters:
+ * [in] order base-2 logarithm of the length; valid in the range
+ * [1,12].
+ * [out] pSize pointer to the number of bytes required for the
+ * specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_S16(OMX_INT order, OMX_INT *pSize) {
+  OMX_INT NBy2,N,twiddleSize;
+
+  /*
+   * Reject a missing output pointer and any order outside the documented
+   * range [1,12]. A zero or negative order would make the shift below use a
+   * negative count, which is undefined behaviour.
+   */
+  if (!pSize || order < 1 || order > 12) {
+    return OMX_Sts_BadArgErr;
+  }
+
+  NBy2 = 1 << (order - 1);
+  N = NBy2 << 1;
+  twiddleSize = 5 * N / 8;  /* 3 / 4 (N / 2) + N / 4 */
+
+  /* 2 pointers to store bitreversed array and twiddle factor array */
+  *pSize = sizeof(ARMsFFTSpec_R_SC16)
+      /* Twiddle factors */
+      + sizeof(OMX_SC16) * twiddleSize
+      /* Ping Pong buffer for doing the N/2 point complex FFT; */
+      /* extra size 'N' as a temporary buf for FFTInv_CCSToR_S16_Sfs */
+      + sizeof(OMX_S16) * (N << 1)
+      /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
+      + 62 ;
+
+  return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ * END OF FILE
+ *****************************************************************************/
+
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c
index 846536386d9..6ebdae10c86 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c
@@ -25,7 +25,7 @@
* Computes the size of the specification structure required.
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S32.c
index d57294700e8..d5758d0a7ee 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTGetBufSize_R_S32.c
@@ -25,7 +25,7 @@
* Computes the size of the specification structure required.
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_FC32.c
index cc53c5912f1..4a68b6f6b76 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_FC32.c
@@ -11,7 +11,7 @@
* complex float instead of SC32.
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC16.c
index f8248bbbf0b..0a23b8b7651 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC16.c
@@ -25,7 +25,7 @@
* Initializes the specification structures required
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC32.c
index 9ea103f3d68..0b4b5371d5e 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_C_SC32.c
@@ -25,7 +25,7 @@
* Initializes the specification structures required
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_F32.c
index 32d22230ed7..b5067833517 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_F32.c
@@ -11,7 +11,7 @@
* instead of S32.
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16.c
new file mode 100644
index 00000000000..e3fc2719e4d
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Some code in this file was originally from file omxSP_FFTInit_R_S16S32.c
+ * which was licensed as follows.
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+/*
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:
+ * Last Modified Date:
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ */
+
+#include "dl/api/arm/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTInit_R_S16
+ *
+ * Description:
+ * Initialize the real forward-FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions <omxSP_FFTFwd_RToCCS_S16_Sfs> and
+ * <omxSP_FFTInv_CCSToR_S16_Sfs>. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * <FFTGetBufSize_R_S16>.
+ *
+ * Parameters:
+ * [in] order base-2 logarithm of the desired block length;
+ * valid in the range [1,12].
+ * [out] pFFTSpec pointer to the initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_R_S16(OMXFFTSpec_R_S16* pFFTSpec, OMX_INT order) {
+  OMX_INT i = 0, j = 0;
+  OMX_SC16 *pTwiddle = NULL, *pTwiddle1 = NULL, *pTwiddle2 = NULL;
+  OMX_SC16 *pTwiddle3 = NULL, *pTwiddle4 = NULL;
+  OMX_S16 *pBuf = NULL;
+  OMX_U16 *pBitRev = NULL;
+  OMX_U32 pTmp = 0;
+  OMX_INT Nby2 = 0, N = 0, M = 0, diff = 0, step = 0;
+  OMX_S16 x = 0, y = 0, xNeg = 0;
+  OMX_S32 xS32 = 0, yS32 = 0;
+  ARMsFFTSpec_R_SC16 *pFFTStruct = NULL;
+
+  /*
+   * Reject a missing spec buffer and any order outside the documented range
+   * [1,12]. An order below 1 makes "1 << (order - 1)" shift by a negative
+   * count, and an order above 12 makes "1 << (12 - order)" do the same; both
+   * are undefined behaviour.
+   */
+  if (!pFFTSpec || order < 1 || order > 12) {
+    return OMX_Sts_BadArgErr;
+  }
+
+  /* Do the initializations */
+  pFFTStruct = (ARMsFFTSpec_R_SC16*) pFFTSpec;
+  Nby2 = 1 << (order - 1);
+  N = Nby2 << 1;
+  pBitRev = NULL ;  /* optimized implementations don't use bitreversal */
+  /* The twiddle table is placed directly after the spec structure. */
+  pTwiddle = (OMX_SC16*) (sizeof(ARMsFFTSpec_R_SC16) + (OMX_S8*)pFFTSpec);
+
+  /*
+   * Align to 32 byte boundary.
+   * NOTE(review): the pointer is cast to OMX_U32, which assumes the low
+   * address bits survive the cast (true for 32-bit pointers) - confirm
+   * before building for a 64-bit target.
+   */
+  pTmp = ((OMX_U32)pTwiddle)&31;  /* (OMX_U32)pTwiddle % 32 */
+  if(pTmp != 0) {
+    pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32 - pTmp));
+  }
+
+  /* The work buffer follows the 5N/8 twiddle entries. */
+  pBuf = (OMX_S16*) (sizeof(OMX_SC16) * (5 * N / 8) + (OMX_S8*)pTwiddle);
+
+  /* Align to 32 byte boundary */
+  pTmp = ((OMX_U32)pBuf)&31;  /* (OMX_U32)pBuf % 32 */
+  if(pTmp != 0) {
+    pBuf = (OMX_S16*)((OMX_S8*)pBuf + (32 - pTmp));
+  }
+
+  /*
+   * Filling Twiddle factors : exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2).
+   * N/2 point complex FFT is used to compute N point real FFT.
+   * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size
+   * (MaxSize/8 + 1). Rest of the values i.e., up to MaxSize are calculated
+   * using the symmetries of sin and cos.
+   * The max size of the twiddle table needed is 3/4(N/2) for a radix-4 stage.
+   *
+   * W = (-2 * PI) / N
+   * N = 1 << order
+   * W = -PI >> (order - 1)
+   *
+   * Note we use S32 twiddle factor table and round the values to 16 bits.
+   */
+
+  M = Nby2 >> 3;
+  diff = 12 - (order - 1);
+  step = 1 << diff;  /* Step into the twiddle table for the current order */
+
+  /* Entry 0 of the table, rounded from 32 to 16 bits. */
+  xS32 = armSP_FFT_S32TwiddleTable[0];
+  yS32 = armSP_FFT_S32TwiddleTable[1];
+  x = (xS32 + 0x8000) >> 16;
+  y = (yS32 + 0x8000) >> 16;
+  xNeg = 0x7FFF;
+
+  if((order-1) >= 3) {
+    /* i = 0 case */
+    pTwiddle[0].Re = x;
+    pTwiddle[0].Im = y;
+    pTwiddle[2 * M].Re = -y;
+    pTwiddle[2 * M].Im = xNeg;
+    pTwiddle[4 * M].Re = xNeg;
+    pTwiddle[4 * M].Im = y;
+
+    /* Each table entry read fills six entries of the output by symmetry. */
+    for (i=1; i<=M; i++){
+      OMX_S16 x_neg = 0, y_neg = 0;
+      j = i * step;
+
+      xS32 = armSP_FFT_S32TwiddleTable[2 * j];
+      yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
+      x = (xS32 + 0x8000) >> 16;
+      y = (yS32 + 0x8000) >> 16;
+      /* |x_neg = -x| doesn't work when x is 0x8000. */
+      x_neg = (-(xS32 + 0x8000)) >> 16;
+      y_neg = (-(yS32 + 0x8000)) >> 16;
+
+      pTwiddle[i].Re = x;
+      pTwiddle[i].Im = y;
+      pTwiddle[2 * M - i].Re = y_neg;
+      pTwiddle[2 * M - i].Im = x_neg;
+      pTwiddle[2 * M + i].Re = y;
+      pTwiddle[2 * M + i].Im = x_neg;
+      pTwiddle[4 * M - i].Re = x_neg;
+      pTwiddle[4 * M - i].Im = y;
+      pTwiddle[4 * M + i].Re = x_neg;
+      pTwiddle[4 * M + i].Im = y_neg;
+      pTwiddle[6 * M - i].Re = y;
+      pTwiddle[6 * M - i].Im = x;
+    }
+  }
+  else {
+    /* Small complex FFT sizes are written out explicitly. */
+    if ((order - 1) == 2) {
+      pTwiddle[0].Re = x;
+      pTwiddle[0].Im = y;
+      pTwiddle[1].Re = -y;
+      pTwiddle[1].Im = xNeg;
+      pTwiddle[2].Re = xNeg;
+      pTwiddle[2].Im = y;
+    }
+    if ((order-1) == 1) {
+      pTwiddle[0].Re = x;
+      pTwiddle[0].Im = y;
+    }
+  }
+
+  /*
+   * Now fill the last N/4 values : exp^(-j*2*PI*k/N); k=1,3,5,...,N/2-1.
+   * These are used for the final twiddle fix-up for converting complex to
+   * real FFT.
+   */
+
+  M = N >> 3;
+  diff = 12 - order;
+  step = 1 << diff;
+
+  /*
+   * Four cursors into the N/4 fix-up entries that start at offset 3N/8:
+   * pTwiddle1 and pTwiddle3 walk forwards, pTwiddle2 and pTwiddle4 walk
+   * backwards.
+   */
+  pTwiddle1 = pTwiddle + 3 * N / 8;
+  pTwiddle4 = pTwiddle1 + (N / 4 - 1);
+  pTwiddle3 = pTwiddle1 + N / 8;
+  pTwiddle2 = pTwiddle1 + (N / 8 - 1);
+
+  xS32 = armSP_FFT_S32TwiddleTable[0];
+  yS32 = armSP_FFT_S32TwiddleTable[1];
+  x = (xS32 + 0x8000) >> 16;
+  y = (yS32 + 0x8000) >> 16;
+  xNeg = 0x7FFF;
+
+  if((order) >= 3) {
+    /* Only odd k is needed, hence the stride of 2. */
+    for (i = 1; i <= M; i += 2 ) {
+      OMX_S16 x_neg = 0, y_neg = 0;
+
+      j = i*step;
+
+      xS32 = armSP_FFT_S32TwiddleTable[2 * j];
+      yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
+      x = (xS32 + 0x8000) >> 16;
+      y = (yS32 + 0x8000) >> 16;
+      /* |x_neg = -x| doesn't work when x is 0x8000. */
+      x_neg = (-(xS32 + 0x8000)) >> 16;
+      y_neg = (-(yS32 + 0x8000)) >> 16;
+
+      pTwiddle1[0].Re = x;
+      pTwiddle1[0].Im = y;
+      pTwiddle1 += 1;
+      pTwiddle2[0].Re = y_neg;
+      pTwiddle2[0].Im = x_neg;
+      pTwiddle2 -= 1;
+      pTwiddle3[0].Re = y;
+      pTwiddle3[0].Im = x_neg;
+      pTwiddle3 += 1;
+      pTwiddle4[0].Re = x_neg;
+      pTwiddle4[0].Im = y;
+      pTwiddle4 -= 1;
+    }
+  }
+  else {
+    /* order == 1 needs no fix-up entry; order == 2 needs exactly one. */
+    if (order == 2) {
+      pTwiddle1[0].Re = -y;
+      pTwiddle1[0].Im = xNeg;
+    }
+  }
+
+  /* Update the structure */
+  pFFTStruct->N = N;
+  pFFTStruct->pTwiddle = pTwiddle;
+  pFFTStruct->pBitRev = pBitRev;
+  pFFTStruct->pBuf = pBuf;
+
+  return OMX_Sts_NoErr;
+}
+/*****************************************************************************
+ * END OF FILE
+ *****************************************************************************/
+
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16S32.c
index d157b3457c4..9a66430c2df 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S16S32.c
@@ -25,7 +25,7 @@
* Initialize the real forward-FFT specification information struct.
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S32.c
index 337f2a20b28..d55ab065095 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c
+++ b/chromium/third_party/openmax_dl/dl/sp/src/arm/omxSP_FFTInit_R_S32.c
@@ -25,7 +25,7 @@
* Initialize the real forward-FFT specification information struct.
*/
-#include "dl/api/armOMX.h"
+#include "dl/api/arm/armOMX.h"
#include "dl/api/omxtypes.h"
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp b/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp
index 99b3774324f..99280b59c2d 100644
--- a/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp
+++ b/chromium/third_party/openmax_dl/dl/sp/src/test/test_fft.gyp
@@ -17,7 +17,7 @@
],
'dependencies' : [
'../../../dl.gyp:openmax_dl',
- 'test_utilities'
+ 'test_utilities',
],
'conditions': [
['big_float_fft == 1', {
@@ -27,7 +27,110 @@
}],
],
},
+ 'conditions': [
+ ['target_arch == "arm"', {
+ # Test programs supported on ARM
+ 'targets': [
+ {
+ # Test complex fixed-point 16-bit FFT
+ 'target_name': 'test_fft16',
+ 'type': 'executable',
+ 'sources': [
+ 'test_fft16.c',
+ ],
+ },
+ {
+ # Test complex fixed-point 32-bit FFT
+ 'target_name': 'test_fft32',
+ 'type': 'executable',
+ 'sources': [
+ 'test_fft32.c',
+ ],
+ },
+ {
+ # Test real 32-bit fixed-point FFT
+ 'target_name': 'test_rfft32',
+ 'type': 'executable',
+ 'sources': [
+ 'test_rfft32.c',
+ ],
+ },
+ {
+ # Test real 16-bit fixed-point FFT implemented with S32 routines.
+ 'target_name': 'test_rfft16_s32',
+ 'type': 'executable',
+ 'sources': [
+ 'test_rfft16_s32.c',
+ ],
+ },
+ {
+ # Test real 16-bit fixed-point FFT implemented with S16 routines.
+ 'target_name': 'test_rfft16_s16',
+ 'type': 'executable',
+ 'sources': [
+ 'test_rfft16_s16.c',
+ ],
+ },
+ {
+ # Test complex floating-point FFT
+ 'target_name': 'test_float_fft',
+ 'type': 'executable',
+ 'sources': [
+ 'test_float_fft.c',
+ 'support/float_fft_neon.c',
+ ],
+ },
+ # Non-NEON test programs
+ {
+ # Test complex floating-point FFT, non-NEON
+ 'target_name': 'test_float_fft_armv7',
+ 'type': 'executable',
+ 'defines': [
+ 'ARM_VFP_TEST'
+ ],
+ 'sources': [
+ 'test_float_fft.c',
+ 'support/float_fft_armv7.c',
+ ],
+ },
+ {
+ # Test real floating-point FFT, non-NEON
+ 'target_name': 'test_float_rfft_armv7',
+ 'type': 'executable',
+ 'sources': [
+ 'test_float_rfft.c',
+ 'support/float_rfft_armv7.c',
+ 'support/float_rfft_thresholds.h',
+ ],
+ },
+ {
+ # Test real floating-point FFT, detecting NEON support
+ 'target_name': 'test_float_rfft_detect',
+ 'type': 'executable',
+ 'sources': [
+ 'test_float_rfft.c',
+ 'support/float_rfft_detect.c',
+ 'support/float_rfft_thresholds.h',
+ ],
+ },
+ {
+ # Simple timing test of FFTs, non-NEON
+ 'target_name': 'test_fft_time_armv7',
+ 'type': 'executable',
+ 'defines': [
+ # Timing test for non-NEON is only supported for float FFTs.
+ 'ARM_VFP_TEST',
+ 'FLOAT_ONLY',
+ ],
+ 'sources': [
+ 'test_fft_time.c',
+ ],
+ },
+ ],
+ }],
+ ],
'targets': [
+ # Targets that should be supported by all architectures
{
# Test utilities
'target_name': 'test_utilities',
@@ -43,51 +146,24 @@
],
},
{
- # Test complex fixed-point 16-bit FFT
- 'target_name': 'test_fft16',
- 'type': 'executable',
- 'sources': [
- 'test_fft16.c',
- ],
- },
- {
- # Test complex fixed-point 32-bit FFT
- 'target_name': 'test_fft32',
- 'type': 'executable',
- 'sources': [
- 'test_fft32.c',
- ],
- },
- {
- # Test real 32-bit fixed-point FFT
- 'target_name': 'test_rfft32',
- 'type': 'executable',
- 'sources': [
- 'test_rfft32.c',
- ],
- },
- {
- # Test real 16-bit fixed-point FFT
- 'target_name': 'test_rfft16',
- 'type': 'executable',
- 'sources': [
- 'test_rfft16.c',
- ],
- },
- {
- # Test complex floating-point FFT
- 'target_name': 'test_float_fft',
- 'type': 'executable',
- 'sources': [
- 'test_float_fft.c',
- ],
- },
- {
# Test real floating-point FFT
'target_name': 'test_float_rfft',
'type': 'executable',
'sources': [
'test_float_rfft.c',
+ 'support/float_rfft_thresholds.h',
+ ],
+ 'conditions': [
+ ['target_arch == "arm"', {
+ 'sources': [
+ 'support/float_rfft_neon.c',
+ ],
+ }],
+ ['target_arch == "ia32"', {
+ 'sources': [
+ 'support/float_rfft_x86.c',
+ ],
+ }],
],
},
{
@@ -97,18 +173,42 @@
'sources': [
'test_fft_time.c',
],
+ 'conditions': [
+ ['target_arch == "ia32"', {
+ 'defines': [
+ # Timing test only for float FFTs on x86
+ 'FLOAT_ONLY',
+ ],
+ }],
+ ],
},
{
# Build all test programs.
'target_name': 'All',
'type': 'none',
- 'dependencies': [
- 'test_fft16',
- 'test_fft32',
- 'test_float_fft',
+ 'conditions' : [
+ ['target_arch == "arm"', {
+ # Supported test programs for ARM
+ 'dependencies': [
+ 'test_fft16',
+ 'test_fft32',
+ 'test_float_fft',
+ 'test_float_rfft',
+ 'test_rfft16_s32',
+ 'test_rfft16_s16',
+ 'test_rfft32',
+ # Non-Neon tests
+ 'test_fft_time_armv7',
+ 'test_float_fft_armv7',
+ 'test_float_rfft_armv7',
+ # Tests with detection
+ 'test_float_rfft_detect',
+ ],
+ }],
+ ],
+ 'dependencies' : [
+ # All architectures must support at least the float rfft test
'test_float_rfft',
- 'test_rfft16',
- 'test_rfft32',
'test_fft_time',
],
},
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c
new file mode 100644
index 00000000000..b6d1c98279d
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+#include <stdbool.h>
+
+extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+ const OMX_F32 *src,
+ OMX_F32 *buf1,
+ OMX_F32 *buf2,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ bool forward_fft);
+
+extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+ const OMX_F32 *src,
+ OMX_F32 *buf1,
+ OMX_F32 *buf2,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ bool forward_fft);
+
+/**
+ * A two-for-one algorithm is used here to do the real fft:
+ *
+ * Input x[n], (n = 0, ..., N - 1)
+ * Output X[k] = DFT(N, k){x}
+ * a[n] = x[2n], (n = 0, ..., N/2 - 1)
+ * b[n] = x[2n + 1], (n = 0, ..., N/2 - 1)
+ * z[n] = a[n] + j * b[n]
+ * Z[k] = DFT(N/2, k){z}
+ * Z' is the complex conjugate of Z
+ * A[k] = (Z[k] + Z'[N/2 - k]) / 2
+ * B[k] = -j * (Z[k] - Z'[N/2 - k]) / 2
+ * X[k] = A[k] + B[k] * W[k], (W = exp(-j*2*PI*k/N); k = 0, ..., N/2 - 1)
+ * X[k] = A[k] - B[k], (k = N/2)
+ * X' is complex conjugate of X
+ * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
+ */
+
+/**
+ * This function is the last permutation of two-for-one FFT algorithm.
+ * We move the division by 2 to the last step in the implementation, so:
+ * A[k] = (Z[k] + Z'[N/2 - k])
+ * B[k] = -j * (Z[k] - Z'[N/2 - k])
+ * X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
+ * X[k] = (A[k] - B[k]), (k = N/2)
+ * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
+ */
+static void RevbinPermuteFwd(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  OMX_INT lo = 1;
+  OMX_INT hi = half - 1;
+
+  // |in| is in split format: the real part of Z[k] is in[k] and the imaginary
+  // part is in[k + N/2]. Walk the pair (lo, hi) = (k, N/2 - k) inwards.
+  while (lo < quarter) {
+    // A[k] = (Z[k] + Z'[N/2 - k])
+    const OMX_F32 a_re = in[lo] + in[hi];
+    const OMX_F32 a_im = in[hi + half] - in[lo + half];
+
+    // B[k] = -j * (Z[k] - Z'[N/2 - k])
+    const OMX_F32 b_re = in[hi] - in[lo];
+    const OMX_F32 b_im = in[hi + half] + in[lo + half];
+
+    // W[k]; the two twiddle components are stored n floats apart.
+    const OMX_F32 w_0 = twiddle[lo];
+    const OMX_F32 w_n = twiddle[lo + n];
+
+    // prod = B[k] * W[k]
+    const OMX_F32 prod_re = b_re * w_0 + b_im * w_n;
+    const OMX_F32 prod_im = b_re * w_n - b_im * w_0;
+
+    // Write interleaved (Re, Im) pairs.
+    // X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
+    out[2 * lo] = 0.5f * (a_re - prod_im);
+    out[2 * lo + 1] = 0.5f * (prod_re - a_im);
+    // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1)
+    out[2 * hi] = 0.5f * (a_re + prod_im);
+    out[2 * hi + 1] = 0.5f * (prod_re + a_im);
+
+    ++lo;
+    --hi;
+  }
+
+  // The middle bin is written directly from Z[N/4], imaginary part negated.
+  out[half] = in[quarter];
+  out[half + 1] = -in[quarter + half];
+
+  // The first and last output bins are purely real.
+  out[0] = in[0] + in[half];
+  out[1] = 0;
+  out[n] = in[0] - in[half];
+  out[n + 1] = 0;
+}
+
+// Sse version of RevbinPermuteFwd function.
+static void RevbinPermuteFwdSse(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n) {
+ OMX_INT i;
+ OMX_INT j;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+
+ VC v_i;
+ VC v_j;
+ VC v_big_a;
+ VC v_big_b;
+ VC v_temp;
+ VC v_x0;
+ VC v_x1;
+ VC v_tw;
+
+ __m128 factor = _mm_set1_ps(0.5f);
+
+ for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
+ VC_LOAD_SPLIT(&v_i, (in + i), n_by_2);
+
+ VC_LOADU_SPLIT(&v_j, (in + j), n_by_2);
+ VC_REVERSE(&v_j);
+
+ // A[k] = (Z[k] + Z'[N/2 - k])
+ VC_ADD_SUB(&v_big_a, &v_j, &v_i);
+
+ // B[k] = -j * (Z[k] - Z'[N/2 - k])
+ VC_SUB_ADD(&v_big_b, &v_j, &v_i);
+
+ // W[k]
+ VC_LOAD_SPLIT(&v_tw, (twiddle + i), n);
+
+ // temp = B[k] * W[k]
+ VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw);
+
+ VC_SUB_X(&v_x0, &v_big_a, &v_temp);
+ VC_ADD_X(&v_x1, &v_big_a, &v_temp);
+
+ VC_MUL_F(&v_x0, &v_x0, factor);
+ VC_MUL_F(&v_x1, &v_x1, factor);
+
+ // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1)
+ VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0);
+
+ // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1)
+ VC_REVERSE(&v_x1);
+ VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1);
+ }
+
+ out[n_by_2] = in[n_by_4];
+ out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
+
+ out[0] = in[0] + in[n_by_2];
+ out[1] = 0;
+ out[n] = in[0] - in[n_by_2];
+ out[n + 1] = 0;
+}
+
+/**
+ * Forward real FFT of the N floats at |pSrc|; the result is written to |pDst|
+ * as interleaved (Re, Im) pairs. N is read from the spec structure.
+ *
+ * pSrc      input samples; must be non-NULL and 32 byte aligned.
+ * pDst      output; must be non-NULL and 32 byte aligned. Indices up to
+ *           N + 1 are written, so it must hold at least N + 2 floats.
+ * pFFTSpec  specification structure; must be non-NULL.
+ *
+ * Returns OMX_Sts_BadArgErr when a pointer is NULL or misaligned, otherwise
+ * OMX_Sts_NoErr.
+ */
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
+                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
+  // All three pointers are dereferenced below, and the input and output must
+  // be 32 byte aligned.
+  if (!pSrc || !pDst || !pFFTSpec ||
+      (OMX_INT)pSrc & 31 || (OMX_INT)pDst & 31)
+    return OMX_Sts_BadArgErr;
+
+  OMX_INT n;
+  OMX_INT n_by_2;
+  const OMX_F32 *twiddle;
+  OMX_F32 *buf;
+
+  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
+
+  n = pFFTStruct->N;
+
+  // This is to handle the case of order == 1.
+  if (n == 2) {
+    pDst[0] = (pSrc[0] + pSrc[1]);
+    pDst[1] = 0.0f;
+    pDst[2] = (pSrc[0] - pSrc[1]);
+    pDst[3] = 0.0f;
+    return OMX_Sts_NoErr;
+  }
+
+  n_by_2 = n >> 1;
+  buf = pFFTStruct->pBuf1;
+  twiddle = pFFTStruct->pTwiddle;
+
+  // Run the N/2 point complex FFT. Each kernel returns a pointer to its
+  // result, which then feeds the final fix-up step.
+  if(n_by_2 >= 16) {
+    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
+        pSrc,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        true);
+  } else {
+    buf = x86SP_F32_radix2_kernel_OutOfPlace(
+        pSrc,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        true);
+  }
+
+  // Final fix-up from the N/2 point complex result to the N point real
+  // spectrum. The SSE variant is used from n == 8 upwards.
+  if(n >= 8)
+    RevbinPermuteFwdSse(buf, pDst, twiddle, n);
+  else
+    RevbinPermuteFwd(buf, pDst, twiddle, n);
+
+  return OMX_Sts_NoErr;
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c
new file mode 100644
index 00000000000..f686a7f2f58
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_F32
+ *
+ * Description:
+ * Computes the size of the specification structure required for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 32-bit functions
+ * <FFTFwd_RToCCS_F32_Sfs> and <FFTInv_CCSToR_F32_Sfs>.
+ *
+ * Parameters:
+ * [in] order base-2 logarithm of the length; valid in the range
+ * [1,12]. ([1,15] if BIG_FFT_TABLE is defined.)
+ * [out] pSize pointer to the number of bytes required for the
+ * specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_F32(OMX_INT order, OMX_INT *pSize) {
+  // There is nowhere to write the answer without an output pointer.
+  if (!pSize)
+    return OMX_Sts_BadArgErr;
+
+  // The order must lie in [1, TWIDDLE_TABLE_ORDER].
+  if (order < 1 || order > TWIDDLE_TABLE_ORDER)
+    return OMX_Sts_BadArgErr;
+
+  // FFT length, N = 2^order.
+  const OMX_INT fft_len = 1 << order;
+
+  *pSize = sizeof(X86FFTSpec_R_FC32)
+      // Twiddle factors: 2N floats.
+      + sizeof(OMX_F32) * (2 * fft_len)
+      // Ping Pong buffers pBuf1 and pBuf2 for the N/2 point complex FFT:
+      // N floats plus 4 bytes each.
+      + 2 * (sizeof(OMX_F32) * fft_len + 4)
+      // Extra bytes to get 32 byte alignment of ptwiddle, pBuf1
+      + 62;
+
+  return OMX_Sts_NoErr;
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInit_R_F32.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInit_R_F32.c
new file mode 100644
index 00000000000..564f1666274
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInit_R_F32.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This is a modification of omxSP_FFTInit_R_S32.c to support float
+ * instead of S32.
+ */
+
+#include <stdint.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+
+/**
+ * Function: omxSP_FFTInit_R_F32
+ *
+ * Description:
+ * Initialize the real FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions |omxSP_FFTFwd_RToCCS_F32_Sfs| and
+ * |omxSP_FFTInv_CCSToR_F32_Sfs|. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * |omxSP_FFTGetBufSize_R_F32|.
+ *
+ * The memory behind pFFTSpec is laid out as: the X86FFTSpec_R_FC32
+ * header, the twiddle table (2 * N floats, 32 byte aligned), then the
+ * two work buffers (the first one 32 byte aligned, the second one
+ * N + 4 floats after the first).
+ *
+ * Parameters:
+ * [in]  order      base-2 logarithm of the desired block length;
+ *                  valid in the range [1,12]. ([1,15] if
+ *                  BIG_FFT_TABLE is defined.)
+ * [out] pFFTSpec   pointer to the initialized specification structure.
+ *
+ * Return Value:
+ * OMX_Sts_BadArgErr if pFFTSpec is NULL or order is outside
+ * [1, TWIDDLE_TABLE_ORDER]; OMX_Sts_NoErr otherwise.
+ *
+ */
+
+OMXResult omxSP_FFTInit_R_F32(OMXFFTSpec_R_F32 *pFFTSpec, OMX_INT order)
+{
+  OMX_F32 *pTwiddle;
+  OMX_F32 *pBuf;
+  OMX_INT i;
+  OMX_INT j;
+  OMX_INT N;
+  OMX_INT NBy2;
+  OMX_INT NBy4;
+  OMX_INT diff;
+  // Low address bits of a pointer. uintptr_t keeps the pointer-to-integer
+  // conversion lossless on 64-bit targets, where OMX_U32 would narrow it.
+  uintptr_t misalign;
+  X86FFTSpec_R_FC32 *pFFTStruct = (X86FFTSpec_R_FC32 *) pFFTSpec;
+  OMX_F32 real;
+  OMX_F32 imag;
+
+  if (!pFFTSpec || (order < 1) || (order > TWIDDLE_TABLE_ORDER))
+    return OMX_Sts_BadArgErr;
+
+  N = 1 << order;
+  NBy2 = N >> 1;
+
+  // The twiddle table starts right after the spec header.
+  pTwiddle = (OMX_F32*) (sizeof(X86FFTSpec_R_FC32) + (OMX_S8*) pFFTSpec);
+
+  // Align to 32 byte boundary.
+  misalign = ((uintptr_t)pTwiddle) & 31;
+  if (misalign)
+    pTwiddle = (OMX_F32*) ((OMX_S8*)pTwiddle + (32 - misalign));
+
+  // The work buffers follow the 2 * N twiddle floats.
+  pBuf = (OMX_F32*) (sizeof(OMX_F32) * (N << 1) + (OMX_S8*) pTwiddle);
+
+  // Align to 32 byte boundary.
+  misalign = ((uintptr_t)pBuf) & 31;
+  if (misalign)
+    pBuf = (OMX_F32*) ((OMX_S8*)pBuf + (32 - misalign));
+
+  // Calculating Twiddle Factors.
+  // Stride through the shared table for this transform length.
+  diff = 1 << (TWIDDLE_TABLE_ORDER - order + 1);
+
+  // For SSE optimization, using twiddle with split format by which the real and
+  // imag data are stored into first and last halves of the buffer separately
+  // The negatives are moved when generating pTwiddle table.
+  if (order > 1) {
+    NBy4 = N >> 2;
+    // Each (real, imag) table entry is written, with sign changes and
+    // real/imag swaps, to eight slots of the real half [0, N) and to the
+    // matching eight slots of the imag half [N, 2N).
+    for (i = 0, j = 0; i <= NBy4 >> 1; ++i, j += diff) {
+      real = armSP_FFT_F32TwiddleTable[j];
+      imag = armSP_FFT_F32TwiddleTable[j + 1];
+
+      pTwiddle[i] = -real;
+      pTwiddle[i + N] = -imag;
+
+      pTwiddle[NBy4 - i] = imag;
+      pTwiddle[NBy4 - i + N] = real;
+
+      pTwiddle[NBy4 + i] = -imag;
+      pTwiddle[NBy4 + i + N] = real;
+
+      pTwiddle[NBy2 - i] = real;
+      pTwiddle[NBy2 - i + N] = -imag;
+
+      pTwiddle[NBy2 + i] = real;
+      pTwiddle[NBy2 + i + N] = imag;
+
+      pTwiddle[NBy4 * 3 - i] = -imag;
+      pTwiddle[NBy4 * 3 - i + N] = -real;
+
+      pTwiddle[NBy4 * 3 + i] = imag;
+      pTwiddle[NBy4 * 3 + i + N] = -real;
+
+      pTwiddle[N - i - 1] = -real;
+      pTwiddle[(N << 1) - i - 1] = imag;
+    }
+  } else {
+    // order == 1 (N == 2): only four twiddle floats exist.
+    pTwiddle[0] = armSP_FFT_F32TwiddleTable[0];
+    pTwiddle[2] = armSP_FFT_F32TwiddleTable[1];
+    pTwiddle[1] = -pTwiddle[0];
+    pTwiddle[3] = pTwiddle[2];
+  }
+  pFFTStruct->N = N;
+  pFFTStruct->pTwiddle = pTwiddle;
+  pFFTStruct->pBuf1 = pBuf;
+  pFFTStruct->pBuf2 = pBuf + N + 4;
+
+  return OMX_Sts_NoErr;
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c
new file mode 100644
index 00000000000..1733d665288
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+#include <stdbool.h>
+
+// Complex FFT kernels defined in other translation units. In this file
+// they are called with forward_fft == false, and the pointer they return
+// is used as the location of the transformed (split format) data.
+
+// Kernel used below for the small transforms (n_by_2 < 16).
+OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft);
+
+// Kernel used below for all larger transforms.
+OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft);
+
+/**
+ * A two-for-one algorithm is used here to do the real ifft:
+ *
+ * Input X[k], (k = 0, ..., N - 1)
+ * Output x[n] = IDFT(N, k){X}
+ * X' is complex conjugate of X
+ * A[k] = (X[k] + X'[N/2 - k]) / 2
+ * B[k] = (X[k] - X'[N/2 - k]) / 2 * W[k], (W = exp(j*2*PI*k/N);
+ * k = 0, ..., N/2 - 1)
+ * Z[k] = A[k] + j * B[k], (k = 0, ..., N/2 - 1)
+ * z[n] = IDFT(N/2, k){Z}
+ * x[2n] = Re(z[n]), (n = 0, ..., N/2 - 1)
+ * x[2n + 1] = Im(z[n]), (n = 0, ..., N/2 - 1)
+ */
+
+/**
+ * First permutation step of the two-for-one IFFT algorithm (scalar version).
+ * The division by 2 is deferred to the final scaling step, so this computes:
+ * A[k] = (X[k] + X'[N/2 - k])
+ * B[k] = (X[k] - X'[N/2 - k]) * W[k], (k = 0, ..., N/2 - 1)
+ * Z[k] = A[k] + j * B[k], (k = 0, ..., N/2 - 1)
+ *
+ * |in| is read as interleaved (re, im) pairs; |out| is written in split
+ * format with real parts at out[k] and imaginary parts at out[k + n/2].
+ * |twiddle| is read in split format with the imaginary half n floats after
+ * the real half.
+ */
+static void RevbinPermuteInv(const OMX_F32 *in,
+                             OMX_F32 *out,
+                             const OMX_F32 *twiddle,
+                             OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  OMX_INT lo = 2;
+  OMX_INT hi = n - 2;
+
+  // Walk the spectrum from both ends towards the middle, one complex
+  // point (two floats) at a time.
+  while (lo < half) {
+    const OMX_INT lo_out = lo >> 1;
+    const OMX_INT hi_out = hi >> 1;
+    // W[k]
+    const OMX_F32 *w = twiddle + lo_out;
+
+    // A[k] = (X[k] + X'[N/2 - k])
+    const OMX_F32 a_re = in[lo] + in[hi];
+    const OMX_F32 a_im = in[lo + 1] - in[hi + 1];
+
+    // d = (X[k] - X'[N/2 - k])
+    const OMX_F32 d_re = in[lo] - in[hi];
+    const OMX_F32 d_im = in[lo + 1] + in[hi + 1];
+
+    // B[k] = d * W[k]
+    const OMX_F32 b_re = d_re * w[0] + d_im * w[n];
+    const OMX_F32 b_im = d_re * w[n] - d_im * w[0];
+
+    // Z[k] = (A[k] + j * B[k]), stored in split format.
+    // The 1/2 scaling is folded into the final scaling done in
+    // omxSP_FFTInv_CCSToR_F32_Sfs.
+    out[lo_out] = a_re + b_im;
+    out[lo_out + half] = b_re + a_im;
+    out[hi_out] = a_re - b_im;
+    out[hi_out + half] = b_re - a_im;
+
+    lo += 2;
+    hi -= 2;
+  }
+
+  // The n_by_2 complex point
+  out[quarter] = 2.0f * in[half];
+  out[quarter + half] = -2.0f * in[half + 1];
+
+  // The first complex point
+  out[0] = in[0] + in[n];
+  out[half] = in[0] - in[n];
+}
+
+// SSE version of RevbinPermuteInv: the same permutation, handling four
+// complex points from the front and four from the back per iteration.
+static void RevbinPermuteInvSse(const OMX_F32 *in,
+                                OMX_F32 *out,
+                                const OMX_F32 *twiddle,
+                                OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  OMX_INT fwd;
+  OMX_INT rev = half - 3;
+
+  for (fwd = 0; fwd < quarter; fwd += 4) {
+    const OMX_F32 *src_fwd = in + (fwd << 1);
+    const OMX_F32 *src_rev = in + (rev << 1);
+    // W[k]
+    const OMX_F32 *tw = twiddle + fwd;
+
+    VC v_fwd;
+    VC v_rev;
+    VC v_sum;
+    VC v_diff;
+    VC v_rot;
+    VC v_tw;
+
+    VC_LOAD_INTERLEAVE(&v_fwd, src_fwd);
+
+    // Gather the four mirrored points, de-interleaving real and imag.
+    v_rev.real = _mm_set_ps(src_rev[0], src_rev[2], src_rev[4], src_rev[6]);
+    v_rev.imag = _mm_set_ps(src_rev[1], src_rev[3], src_rev[5], src_rev[7]);
+
+    // A[k] = (X[k] + X'[N/2 - k])
+    VC_ADD_SUB(&v_sum, &v_fwd, &v_rev);
+
+    // diff = (X[k] - X'[N/2 - k])
+    VC_SUB_ADD(&v_diff, &v_fwd, &v_rev);
+
+    VC_LOAD_SPLIT(&v_tw, tw, n);
+
+    // B[k] = (X[k] - X'[N/2 - k]) * W[k]
+    VC_CONJ_MUL(&v_rot, &v_diff, &v_tw);
+
+    // Convert split format to interleaved format.
+    // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
+    // The scaling of 1/2 will be merged into the scaling in
+    // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs.
+    VC_ADD_X_STORE_SPLIT((out + fwd), &v_sum, &v_rot, half);
+
+    VC_SUB_X_INVERSE_STOREU_SPLIT((out + rev), &v_sum, &v_rot, half);
+
+    rev -= 4;
+  }
+
+  // The n_by_2 complex point
+  out[quarter] = 2.0f * in[half];
+  out[quarter + half] = -2.0f * in[half + 1];
+
+  // The first complex point
+  out[0] = in[0] + in[n];
+  out[half] = in[0] - in[n];
+}
+
+/**
+ * Inverse real FFT: converts a CCS-format spectrum into n real samples,
+ * scaled by 1/n.
+ *
+ * Parameters:
+ * [in]  pSrc      spectrum as interleaved (re, im) floats; elements up to
+ *                 index n + 1 are read. Must be 32 byte aligned.
+ * [out] pDst      receives n real floats. Must be 32 byte aligned.
+ * [in]  pFFTSpec  specification structure; its N, pTwiddle, pBuf1 and pBuf2
+ *                 fields are read here, as set up by |omxSP_FFTInit_R_F32|.
+ *
+ * Return Value:
+ * OMX_Sts_BadArgErr if any pointer is NULL or pSrc/pDst is not 32 byte
+ * aligned; OMX_Sts_NoErr otherwise.
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
+                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
+  OMX_INT n;
+  OMX_INT n_by_2;
+  OMX_INT i;
+  const OMX_F32 *twiddle;
+  OMX_F32 *buf;
+  OMX_F32 factor;
+  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
+
+  // All pointers must be non-NULL; input and output must be 32 byte aligned.
+  // NOTE(review): the pointer-to-OMX_INT casts narrow on 64-bit targets;
+  // only the low five bits are used, but uintptr_t would be cleaner.
+  if (!pSrc || !pDst || !pFFTSpec ||
+      (OMX_INT)pSrc & 31 || (OMX_INT)pDst & 31)
+    return OMX_Sts_BadArgErr;
+
+  n = pFFTStruct->N;
+
+  // This is to handle the case of order == 1.
+  if (n == 2) {
+    pDst[0] = (pSrc[0] + pSrc[2]) / 2;
+    pDst[1] = (pSrc[0] - pSrc[2]) / 2;
+    return OMX_Sts_NoErr;
+  }
+
+  n_by_2 = n >> 1;
+  buf = pFFTStruct->pBuf1;
+
+  twiddle = pFFTStruct->pTwiddle;
+
+  // Build the n/2 point complex spectrum Z[k] (split format) in pBuf1.
+  // The SSE variant handles four points per iteration, so it is only
+  // used from n == 8 upwards.
+  if (n < 8)
+    RevbinPermuteInv(pSrc, buf, twiddle, n);
+  else
+    RevbinPermuteInvSse(pSrc, buf, twiddle, n);
+
+  // n/2 point complex inverse FFT; the kernel returns the buffer that
+  // holds the result.
+  if (n_by_2 < 16) {
+    buf = x86SP_F32_radix2_kernel_OutOfPlace(
+        buf,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        false);
+  } else {
+    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
+        buf,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        false);
+  }
+
+  // Scale the result by 1/n.
+  // It contains a scaling factor of 1/2 in
+  // RevbinPermuteInv/RevbinPermuteInvSse.
+  factor = 1.0f / n;
+
+  if (n < 8) {
+    // Scalar path: scale and convert split format to interleaved.
+    for (i = 0; i < n_by_2; i++) {
+      pDst[i << 1] = buf[i] * factor;
+      pDst[(i << 1) + 1] = buf[i + n_by_2] * factor;
+    }
+  } else {
+    OMX_F32 *base;
+    OMX_F32 *dst;
+    VC temp0;
+    VC temp1;
+    __m128 mFactor = _mm_load1_ps(&factor);
+
+    // Two things are done in this loop:
+    // 1 Get the result scaled; 2 Change the format from split to interleaved.
+    for (i = 0; i < n_by_2; i += 4) {
+      base = buf + i;
+      dst = pDst + (i << 1);
+      VC_LOAD_SPLIT(&temp0, base, n_by_2);
+      VC_MUL_F(&temp1, &temp0, mFactor);
+      VC_STORE_INTERLEAVE(dst, &temp1);
+    }
+  }
+
+  return OMX_Sts_NoErr;
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c
new file mode 100644
index 00000000000..6fa21cfb40d
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Radix-2 butterfly pass reading interleaved (re, im) input and writing
+// split output (real at out[k], imaginary at out[k + n]).
+// Each input point in[i], in[i + 1] is paired with the point n floats
+// later; the sum goes to out[k] and the difference to out[k + n / 2].
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  OMX_INT k;
+
+  // k counts complex points; each one occupies two floats of |in|.
+  for (k = 0; (k << 1) < n; ++k) {
+    const OMX_F32 *a = in + (k << 1);
+    const OMX_F32 *b = a + n;
+    OMX_F32 *sum = out + k;
+    OMX_F32 *dif = sum + (n >> 1);
+
+    // sum = a + b
+    sum[0] = a[0] + b[0];
+    sum[n] = a[1] + b[1];
+
+    // dif = a - b
+    dif[0] = a[0] - b[0];
+    dif[n] = a[1] - b[1];
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c
new file mode 100644
index 00000000000..f4d991c85c3
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Radix-2 butterfly pass on split-format data (real at x[0], imaginary at
+// x[n]). Adjacent inputs in[i] and in[i + 1] are combined: the second one
+// is multiplied by the twiddle at twiddle[i] (imaginary part 2 * n floats
+// later), then the sum goes to out[k] and the difference to out[k + n / 2].
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  OMX_F32 *sum = out;
+  OMX_INT i;
+
+  for (i = 0; i < n; i += 2, ++sum) {
+    const OMX_INT tw_imag = n << 1;
+    const OMX_F32 *w = twiddle + i;
+    const OMX_F32 *even = in + i;
+    const OMX_F32 *odd = even + 1;
+    OMX_F32 *dif = sum + (n >> 1);
+
+    // prod = w * odd
+    const OMX_F32 prod_re = w[0] * odd[0] - w[tw_imag] * odd[n];
+    const OMX_F32 prod_im = w[0] * odd[n] + w[tw_imag] * odd[0];
+
+    // sum = even + prod
+    sum[0] = even[0] + prod_re;
+    sum[n] = even[n] + prod_im;
+
+    // dif = even - prod
+    dif[0] = even[0] - prod_re;
+    dif[n] = even[n] - prod_im;
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c
new file mode 100644
index 00000000000..a712d96e4b3
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// SSE variant of the radix-2 pass over adjacent split-format inputs:
+// eight inputs (four butterflies) are handled per iteration.
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  OMX_INT i;
+
+  // This function is used when n >= 8
+  assert(n >= 8);
+  if (n < 8) return;
+
+  for (i = 0; i < n; i += 8) {
+    // Four outputs are produced for every eight inputs.
+    OMX_F32 *sum_dst = out + (i >> 1);
+    OMX_F32 *dif_dst;
+    const OMX_F32 *tw_re = twiddle + i;
+    const OMX_F32 *tw_im;
+    const OMX_F32 *src;
+
+    VC v_tw;
+    VC v_even;
+    VC v_odd;
+    VC v_prod;
+
+    // Load twiddle: every second entry, imaginary half 2 * n floats later.
+    v_tw.real = _mm_set_ps(tw_re[6], tw_re[4], tw_re[2], tw_re[0]);
+    tw_im = tw_re + (n << 1);
+    v_tw.imag = _mm_set_ps(tw_im[6], tw_im[4], tw_im[2], tw_im[0]);
+
+    // Load real part
+    src = in + i;
+    VC_LOAD_SHUFFLE(&(v_even.real), &(v_odd.real), src);
+
+    // Load imag part
+    src = src + n;
+    VC_LOAD_SHUFFLE(&(v_even.imag), &(v_odd.imag), src);
+
+    dif_dst = sum_dst + (n >> 1);
+    VC_MUL(&v_prod, &v_tw, &v_odd);
+
+    VC_SUB_STORE_SPLIT(dif_dst, &v_even, &v_prod, n);
+
+    VC_ADD_STORE_SPLIT(sum_dst, &v_even, &v_prod, n);
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c
new file mode 100644
index 00000000000..37148775e25
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Radix-2 butterfly pass on split-format data, organised as |sub_size|
+// groups of |sub_num| points. Within a group, point s is paired with point
+// s + sub_num / 2; the second one is multiplied by the group's twiddle
+// (twiddle[grp * sub_num], imaginary part 2 * n floats later). Sums are
+// written consecutively from out[0], differences n / 2 floats after them.
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  const OMX_INT pairs = sub_num >> 1;
+  OMX_F32 *sum = out;
+  OMX_INT g;
+  OMX_INT s;
+
+  for (g = 0; g < sub_size; ++g) {
+    // One twiddle factor is shared by the whole group.
+    const OMX_F32 *w = twiddle + g * sub_num;
+
+    for (s = 0; s < pairs; ++s, ++sum) {
+      const OMX_F32 *first = in + s + g * sub_num;
+      const OMX_F32 *second = first + pairs;
+      OMX_F32 *dif = sum + (n >> 1);
+
+      // prod = w * second
+      const OMX_F32 prod_re = w[0] * second[0] - w[n << 1] * second[n];
+      const OMX_F32 prod_im = w[0] * second[n] + w[n << 1] * second[0];
+
+      // sum = first + prod
+      sum[0] = first[0] + prod_re;
+      sum[n] = first[n] + prod_im;
+
+      // dif = first - prod
+      dif[0] = first[0] - prod_re;
+      dif[n] = first[n] - prod_im;
+    }
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c
new file mode 100644
index 00000000000..36a40d8a910
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Radix-4 butterfly pass that first copies interleaved (re, im) input into
+// split format in |out| (real at out[i], imaginary at out[i + n]) and then
+// runs the butterflies in place on |out|, combining the four points
+// i, i + n/4, i + n/2 and i + 3n/4. No twiddle factors are applied.
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  const OMX_INT quarter = n >> 2;
+  OMX_INT i;
+
+  // Transform from interleaved format to split format.
+  for (i = 0; i < n; i++) {
+    out[i] = in[i << 1];
+    out[i + n] = in[(i << 1) + 1];
+  }
+
+  // The data now lives in [out], so the butterflies below work in place:
+  // every input of a butterfly is read before any of its outputs is stored.
+  for (i = 0; i < quarter; i++) {
+    OMX_F32 *p0 = out + i;
+    OMX_F32 *p1 = p0 + quarter;
+    OMX_F32 *p2 = p1 + quarter;
+    OMX_F32 *p3 = p2 + quarter;
+
+    // s02 = p0 + p2
+    const OMX_F32 s02_re = p0[0] + p2[0];
+    const OMX_F32 s02_im = p0[n] + p2[n];
+
+    // d02 = p0 - p2
+    const OMX_F32 d02_re = p0[0] - p2[0];
+    const OMX_F32 d02_im = p0[n] - p2[n];
+
+    // s13 = p1 + p3
+    const OMX_F32 s13_re = p1[0] + p3[0];
+    const OMX_F32 s13_im = p1[n] + p3[n];
+
+    // d13 = p1 - p3
+    const OMX_F32 d13_re = p1[0] - p3[0];
+    const OMX_F32 d13_im = p1[n] - p3[n];
+
+    // p0 = s02 + s13
+    p0[0] = s02_re + s13_re;
+    p0[n] = s02_im + s13_im;
+
+    // p2 = s02 - s13
+    p2[0] = s02_re - s13_re;
+    p2[n] = s02_im - s13_im;
+
+    // p1 = (d02.re + d13.im, d02.im - d13.re)
+    p1[0] = d02_re + d13_im;
+    p1[n] = d02_im - d13_re;
+
+    // p3 = (d02.re - d13.im, d02.im + d13.re)
+    p3[0] = d02_re - d13_im;
+    p3[n] = d02_im + d13_re;
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c
new file mode 100644
index 00000000000..58908d3aa2b
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// SSE variant of the twiddle-free radix-4 pass: per iteration, eight
+// floats are loaded from each of four input streams spaced n / 2 floats
+// apart, and four results are stored to each of four output streams
+// spaced n / 4 floats apart.
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  const OMX_INT in_stride = n >> 1;
+  const OMX_INT out_stride = n >> 2;
+  OMX_INT i;
+
+  for (i = 0; i < in_stride; i += 8) {
+    const OMX_F32 *src0 = in + i;
+    const OMX_F32 *src1 = src0 + in_stride;
+    const OMX_F32 *src2 = src1 + in_stride;
+    const OMX_F32 *src3 = src2 + in_stride;
+
+    // Four outputs per stream for every eight input floats.
+    OMX_F32 *dst0 = out + (i >> 1);
+    OMX_F32 *dst1 = dst0 + out_stride;
+    OMX_F32 *dst2 = dst1 + out_stride;
+    OMX_F32 *dst3 = dst2 + out_stride;
+
+    VC v_x0;
+    VC v_x1;
+    VC v_x2;
+    VC v_x3;
+    VC v_y0;
+    VC v_y1;
+    VC v_y2;
+    VC v_y3;
+
+    VC_LOAD_SHUFFLE(&(v_x0.real), &(v_x0.imag), src0);
+    VC_LOAD_SHUFFLE(&(v_x1.real), &(v_x1.imag), src1);
+    VC_LOAD_SHUFFLE(&(v_x2.real), &(v_x2.imag), src2);
+    VC_LOAD_SHUFFLE(&(v_x3.real), &(v_x3.imag), src3);
+
+    RADIX4_BUTTERFLY_FS(&v_y0, &v_y1, &v_y2, &v_y3,
+                        &v_x0, &v_x1, &v_x2, &v_x3);
+
+    RADIX4_FWD_BUTTERFLY_STORE(dst0, dst1, dst2, dst3,
+                               &v_y0, &v_y1, &v_y2, &v_y3, n);
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c
new file mode 100644
index 00000000000..08ab35bf86a
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Radix-4 butterfly pass on split-format data (real at x[0], imaginary at
+// x[n]) that combines four adjacent inputs per butterfly. Inputs 1..3 are
+// multiplied by twiddles taken at offsets i, 2 * i and 3 * i (imaginary
+// parts 2 * n floats later); the four results go to output streams spaced
+// n / 4 floats apart.
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_imag = n << 1;
+  OMX_F32 *y0 = out;
+  OMX_INT i;
+
+  for (i = 0; i < half; i += 2, ++y0) {
+    const OMX_F32 *w1 = twiddle + i;
+    const OMX_F32 *w2 = w1 + i;
+    const OMX_F32 *w3 = w2 + i;
+    const OMX_F32 *x0 = in + (i << 1);
+    const OMX_F32 *x1 = x0 + 1;
+    const OMX_F32 *x2 = x1 + 1;
+    const OMX_F32 *x3 = x2 + 1;
+    OMX_F32 *y1 = y0 + quarter;
+    OMX_F32 *y2 = y1 + quarter;
+    OMX_F32 *y3 = y2 + quarter;
+
+    // r1 = w1 * x1
+    const OMX_F32 r1_re = w1[0] * x1[0] - w1[tw_imag] * x1[n];
+    const OMX_F32 r1_im = w1[0] * x1[n] + w1[tw_imag] * x1[0];
+
+    // r2 = w2 * x2
+    const OMX_F32 r2_re = w2[0] * x2[0] - w2[tw_imag] * x2[n];
+    const OMX_F32 r2_im = w2[0] * x2[n] + w2[tw_imag] * x2[0];
+
+    // r3 = w3 * x3
+    const OMX_F32 r3_re = w3[0] * x3[0] - w3[tw_imag] * x3[n];
+    const OMX_F32 r3_im = w3[0] * x3[n] + w3[tw_imag] * x3[0];
+
+    // s02 = x0 + r2
+    const OMX_F32 s02_re = x0[0] + r2_re;
+    const OMX_F32 s02_im = x0[n] + r2_im;
+
+    // d02 = x0 - r2
+    const OMX_F32 d02_re = x0[0] - r2_re;
+    const OMX_F32 d02_im = x0[n] - r2_im;
+
+    // s13 = r1 + r3
+    const OMX_F32 s13_re = r1_re + r3_re;
+    const OMX_F32 s13_im = r1_im + r3_im;
+
+    // d13 = r1 - r3
+    const OMX_F32 d13_re = r1_re - r3_re;
+    const OMX_F32 d13_im = r1_im - r3_im;
+
+    // y0 = s02 + s13
+    y0[0] = s02_re + s13_re;
+    y0[n] = s02_im + s13_im;
+
+    // y2 = s02 - s13
+    y2[0] = s02_re - s13_re;
+    y2[n] = s02_im - s13_im;
+
+    // y1 = (d02.re + d13.im, d02.im - d13.re)
+    y1[0] = d02_re + d13_im;
+    y1[n] = d02_im - d13_re;
+
+    // y3 = (d02.re - d13.im, d02.im + d13.re)
+    y3[0] = d02_re - d13_im;
+    y3[n] = d02_im + d13_re;
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c
new file mode 100644
index 00000000000..4fc34271809
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// SSE variant of the radix-4 pass over adjacent split-format inputs:
+// sixteen inputs (four butterflies) are handled per iteration.
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_imag = n << 1;
+  OMX_INT i;
+
+  for (i = 0; i < half; i += 8) {
+    // Twiddle bases at offsets i, 2 * i and 3 * i; the imaginary halves
+    // sit 2 * n floats after the real ones.
+    const OMX_F32 *w1_re = twiddle + i;
+    const OMX_F32 *w2_re = w1_re + i;
+    const OMX_F32 *w3_re = w2_re + i;
+    const OMX_F32 *w1_im = w1_re + tw_imag;
+    const OMX_F32 *w2_im = w2_re + tw_imag;
+    const OMX_F32 *w3_im = w3_re + tw_imag;
+
+    const OMX_F32 *src0 = in + (i << 1);
+    const OMX_F32 *src1 = src0 + 4;
+    const OMX_F32 *src2 = src1 + 4;
+    const OMX_F32 *src3 = src2 + 4;
+
+    // Four outputs per stream for every eight steps of i.
+    OMX_F32 *dst0 = out + (i >> 1);
+    OMX_F32 *dst1 = dst0 + quarter;
+    OMX_F32 *dst2 = dst1 + quarter;
+    OMX_F32 *dst3 = dst2 + quarter;
+
+    VC v_w1;
+    VC v_w2;
+    VC v_w3;
+    VC v_x0;
+    VC v_x1;
+    VC v_x2;
+    VC v_x3;
+    VC v_y0;
+    VC v_y1;
+    VC v_y2;
+    VC v_y3;
+
+    // The three twiddle vectors use element strides of 2, 4 and 6.
+    v_w1.real = _mm_set_ps(w1_re[6], w1_re[4], w1_re[2], w1_re[0]);
+    v_w1.imag = _mm_set_ps(w1_im[6], w1_im[4], w1_im[2], w1_im[0]);
+    v_w2.real = _mm_set_ps(w2_re[12], w2_re[8], w2_re[4], w2_re[0]);
+    v_w2.imag = _mm_set_ps(w2_im[12], w2_im[8], w2_im[4], w2_im[0]);
+    v_w3.real = _mm_set_ps(w3_re[18], w3_re[12], w3_re[6], w3_re[0]);
+    v_w3.imag = _mm_set_ps(w3_im[18], w3_im[12], w3_im[6], w3_im[0]);
+
+    VC_LOAD_MATRIX_TRANSPOSE(&v_x0, &v_x1, &v_x2, &v_x3,
+                             src0, src1, src2, src3, n);
+
+    RADIX4_FWD_BUTTERFLY(&v_y0, &v_y1, &v_y2, &v_y3,
+                         &v_w1, &v_w2, &v_w3,
+                         &v_x0, &v_x1, &v_x2, &v_x3);
+
+    RADIX4_FWD_BUTTERFLY_STORE(dst0, dst1, dst2, dst3,
+                               &v_y0, &v_y1, &v_y2, &v_y3, n);
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c
new file mode 100644
index 00000000000..de2a1be7a9b
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Radix-4 butterfly pass on split-format data (real at x[0], imaginary at
+// x[n]), organised as |sub_size| groups of |sub_num| points. Within a
+// group, the four points s, s + sub_num/4, s + sub_num/2 and
+// s + 3*sub_num/4 are combined. Group 0 applies no twiddle factors; every
+// later group multiplies inputs 1..3 by that group's three twiddles
+// (imaginary parts 2 * n floats after the real ones). Results are written
+// consecutively to four output streams spaced n / 4 floats apart.
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  const OMX_INT tw_stride = sub_num >> 1;
+  const OMX_INT sets = sub_num >> 2;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_imag = n << 1;
+  OMX_F32 *y0 = out;
+  OMX_INT g;
+  OMX_INT s;
+
+  // Group 0: plain butterflies, no twiddle multiplication.
+  for (s = 0; s < sets; ++s, ++y0) {
+    const OMX_F32 *x0 = in + s;
+    const OMX_F32 *x1 = x0 + sets;
+    const OMX_F32 *x2 = x1 + sets;
+    const OMX_F32 *x3 = x2 + sets;
+    OMX_F32 *y1 = y0 + quarter;
+    OMX_F32 *y2 = y1 + quarter;
+    OMX_F32 *y3 = y2 + quarter;
+
+    // s02 = x0 + x2
+    const OMX_F32 s02_re = x0[0] + x2[0];
+    const OMX_F32 s02_im = x0[n] + x2[n];
+
+    // d02 = x0 - x2
+    const OMX_F32 d02_re = x0[0] - x2[0];
+    const OMX_F32 d02_im = x0[n] - x2[n];
+
+    // s13 = x1 + x3
+    const OMX_F32 s13_re = x1[0] + x3[0];
+    const OMX_F32 s13_im = x1[n] + x3[n];
+
+    // d13 = x1 - x3
+    const OMX_F32 d13_re = x1[0] - x3[0];
+    const OMX_F32 d13_im = x1[n] - x3[n];
+
+    // y0 = s02 + s13
+    y0[0] = s02_re + s13_re;
+    y0[n] = s02_im + s13_im;
+
+    // y2 = s02 - s13
+    y2[0] = s02_re - s13_re;
+    y2[n] = s02_im - s13_im;
+
+    // y3 = (d02.re - d13.im, d02.im + d13.re)
+    y3[0] = d02_re - d13_im;
+    y3[n] = d02_im + d13_re;
+
+    // y1 = (d02.re + d13.im, d02.im - d13.re)
+    y1[0] = d02_re + d13_im;
+    y1[n] = d02_im - d13_re;
+  }
+
+  // Groups 1 .. sub_size - 1: twiddle multiplication, then butterflies.
+  for (g = 1; g < sub_size; ++g) {
+    // Three twiddles per group, at 1x, 2x and 3x the group offset.
+    const OMX_F32 *w1 = twiddle + g * tw_stride;
+    const OMX_F32 *w2 = w1 + g * tw_stride;
+    const OMX_F32 *w3 = w2 + g * tw_stride;
+
+    for (s = 0; s < sets; ++s, ++y0) {
+      const OMX_F32 *x0 = in + s + g * sub_num;
+      const OMX_F32 *x1 = x0 + sets;
+      const OMX_F32 *x2 = x1 + sets;
+      const OMX_F32 *x3 = x2 + sets;
+      OMX_F32 *y1 = y0 + quarter;
+      OMX_F32 *y2 = y1 + quarter;
+      OMX_F32 *y3 = y2 + quarter;
+
+      // r1 = w1 * x1
+      const OMX_F32 r1_re = w1[0] * x1[0] - w1[tw_imag] * x1[n];
+      const OMX_F32 r1_im = w1[0] * x1[n] + w1[tw_imag] * x1[0];
+
+      // r2 = w2 * x2
+      const OMX_F32 r2_re = w2[0] * x2[0] - w2[tw_imag] * x2[n];
+      const OMX_F32 r2_im = w2[0] * x2[n] + w2[tw_imag] * x2[0];
+
+      // r3 = w3 * x3
+      const OMX_F32 r3_re = w3[0] * x3[0] - w3[tw_imag] * x3[n];
+      const OMX_F32 r3_im = w3[0] * x3[n] + w3[tw_imag] * x3[0];
+
+      // s02 = x0 + r2
+      const OMX_F32 s02_re = x0[0] + r2_re;
+      const OMX_F32 s02_im = x0[n] + r2_im;
+
+      // d02 = x0 - r2
+      const OMX_F32 d02_re = x0[0] - r2_re;
+      const OMX_F32 d02_im = x0[n] - r2_im;
+
+      // s13 = r1 + r3
+      const OMX_F32 s13_re = r1_re + r3_re;
+      const OMX_F32 s13_im = r1_im + r3_im;
+
+      // d13 = r1 - r3
+      const OMX_F32 d13_re = r1_re - r3_re;
+      const OMX_F32 d13_im = r1_im - r3_im;
+
+      // y0 = s02 + s13
+      y0[0] = s02_re + s13_re;
+      y0[n] = s02_im + s13_im;
+
+      // y2 = s02 - s13
+      y2[0] = s02_re - s13_re;
+      y2[n] = s02_im - s13_im;
+
+      // y1 = (d02.re + d13.im, d02.im - d13.re)
+      y1[0] = d02_re + d13_im;
+      y1[n] = d02_im - d13_re;
+
+      // y3 = (d02.re - d13.im, d02.im + d13.re)
+      y3[0] = d02_re - d13_im;
+      y3[n] = d02_im + d13_re;
+    }
+  }
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c
new file mode 100644
index 00000000000..286f842c464
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+ // Handles the case set_count == 2, in which the set loop cannot be unrolled
+ // by 4 to meet the SSE requirement (4 elements). Each iteration instead
+ // covers two neighbouring groups (two sets each) so all four lanes are used.
+ //
+ // in, out: split layout; the imaginary part of an element is stored n floats
+ //          after its real part.
+ // twiddle: read-only table; the imaginary part of a twiddle is stored n * 2
+ //          floats after its real part.
+ // n:       distance between the real and imaginary halves of in/out.
+ //
+ // The input is fetched with _mm_load_ps, which needs 16-byte aligned
+ // addresses.
+ static void InternalUnroll2Fwd(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n) {
+   OMX_INT i;
+   OMX_INT n_by_2 = n >> 1;
+   OMX_INT n_by_4 = n >> 2;
+   OMX_INT n_mul_2 = n << 1;
+   OMX_F32 *out0 = out;
+
+   for (i = 0; i < n_by_2; i += 8) {
+     // tw1/tw2/tw3 address the twiddles of the first group handled in this
+     // iteration; the *e pointers address those of the group that follows it.
+     const OMX_F32 *tw1 = twiddle + i;
+     const OMX_F32 *tw2 = tw1 + i;
+     const OMX_F32 *tw3 = tw2 + i;
+     const OMX_F32 *tw1e = tw1 + 4;
+     const OMX_F32 *tw2e = tw2 + 8;
+     const OMX_F32 *tw3e = tw3 + 12;
+
+     VC v_tw1;
+     VC v_tw2;
+     VC v_tw3;
+     VC v_t0;
+     VC v_t1;
+     VC v_t2;
+     VC v_t3;
+     VC v_t4;
+     VC v_t5;
+     VC v_t6;
+     VC v_t7;
+
+     // Lanes 0-1 receive the first group's twiddle, lanes 2-3 the second
+     // group's.
+     v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),
+                                 _mm_load_ss(tw1e),
+                                 _MM_SHUFFLE(0, 0, 0, 0));
+     v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
+                                 _mm_load_ss(tw1e + n_mul_2),
+                                 _MM_SHUFFLE(0, 0, 0, 0));
+     v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),
+                                 _mm_load_ss(tw2e),
+                                 _MM_SHUFFLE(0, 0, 0, 0));
+     v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
+                                 _mm_load_ss(tw2e + n_mul_2),
+                                 _MM_SHUFFLE(0, 0, 0, 0));
+     v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),
+                                 _mm_load_ss(tw3e),
+                                 _MM_SHUFFLE(0, 0, 0, 0));
+     v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
+                                 _mm_load_ss(tw3e + n_mul_2),
+                                 _MM_SHUFFLE(0, 0, 0, 0));
+
+     __m128 xmm0;
+     __m128 xmm1;
+     __m128 xmm2;
+     __m128 xmm3;
+     __m128 xmm4;
+     __m128 xmm5;
+     __m128 xmm6;
+     __m128 xmm7;
+
+     // Real parts: 16 consecutive floats cover both groups. The shuffles pick
+     // the low or high pair of each load, so v_t0..v_t3 hold floats
+     // {0,1,8,9}, {2,3,10,11}, {4,5,12,13} and {6,7,14,15} respectively.
+     const OMX_F32 *in0 = in + (i << 1);
+     xmm0 = _mm_load_ps(in0);
+     xmm1 = _mm_load_ps(in0 + 4);
+     xmm2 = _mm_load_ps(in0 + 8);
+     xmm3 = _mm_load_ps(in0 + 12);
+     v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));
+     v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));
+     v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));
+     v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));
+
+     // Imaginary parts: same gathering, n floats further on.
+     xmm4 = _mm_load_ps(in0 + n);
+     xmm5 = _mm_load_ps(in0 + n + 4);
+     xmm6 = _mm_load_ps(in0 + n + 8);
+     xmm7 = _mm_load_ps(in0 + n + 12);
+     v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
+     v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
+     v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
+     v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
+
+     // The four outputs of a butterfly are n / 4 floats apart.
+     OMX_F32 *out1 = out0 + n_by_4;
+     OMX_F32 *out2 = out1 + n_by_4;
+     OMX_F32 *out3 = out2 + n_by_4;
+
+     RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+                          &v_tw1, &v_tw2,
+                          &v_tw3, &v_t0, &v_t1, &v_t2, &v_t3);
+
+     RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                                &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+     out0 += 4;
+   }
+ }
+
+ // SSE radix-4 butterfly pass of the forward transform over sub_size groups of
+ // sub_num elements each.
+ //
+ // in, out:  split layout; the imaginary part of an element is stored n floats
+ //           after its real part.
+ // twiddle:  read-only table; the imaginary part of a twiddle is stored n * 2
+ //           floats after its real part. It is taken as a pointer to const so
+ //           that this definition matches the prototype used by callers.
+ // sub_size: number of groups processed.
+ // sub_num:  number of elements per group; a group holds four runs of
+ //           sub_num / 4 sets.
+ //
+ // The set loops advance four sets at a time, so apart from the dedicated
+ // set_count == 2 path set_count is presumably a multiple of 4 -- TODO confirm
+ // against the caller.
+ void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n,
+     OMX_INT sub_size,
+     OMX_INT sub_num) {
+   OMX_INT set;
+   OMX_INT grp;
+   OMX_INT step = sub_num >> 1;
+   OMX_INT set_count = sub_num >> 2;
+   OMX_INT n_by_4 = n >> 2;
+   OMX_INT n_mul_2 = n << 1;
+
+   OMX_F32 *out0 = out;
+
+   if (set_count == 2) {
+     InternalUnroll2Fwd(in, out, twiddle, n);
+     return;
+   }
+
+   // grp == 0: handled with the twiddle-free butterfly.
+   for (set = 0; set < set_count; set += 4) {
+     const OMX_F32 *in0 = in + set;
+     const OMX_F32 *in1 = in0 + set_count;
+     const OMX_F32 *in2 = in1 + set_count;
+     const OMX_F32 *in3 = in2 + set_count;
+
+     VC v_t0;
+     VC v_t1;
+     VC v_t2;
+     VC v_t3;
+     VC v_t4;
+     VC v_t5;
+     VC v_t6;
+     VC v_t7;
+
+     VC_LOAD_SPLIT(&v_t0, in0, n);
+     VC_LOAD_SPLIT(&v_t1, in1, n);
+     VC_LOAD_SPLIT(&v_t2, in2, n);
+     VC_LOAD_SPLIT(&v_t3, in3, n);
+
+     OMX_F32 *out1 = out0 + n_by_4;
+     OMX_F32 *out2 = out1 + n_by_4;
+     OMX_F32 *out3 = out2 + n_by_4;
+
+     RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
+                         &v_t0, &v_t1, &v_t2, &v_t3);
+
+     RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                                &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+     out0 += 4;
+   }
+
+   // grp > 0: one twiddle triple per group, broadcast to all four lanes.
+   for (grp = 1; grp < sub_size; ++grp) {
+     const OMX_F32 *tw1 = twiddle + grp * step;
+     const OMX_F32 *tw2 = tw1 + grp * step;
+     const OMX_F32 *tw3 = tw2 + grp * step;
+
+     VC v_tw1;
+     VC v_tw2;
+     VC v_tw3;
+
+     v_tw1.real = _mm_load1_ps(tw1);
+     v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
+     v_tw2.real = _mm_load1_ps(tw2);
+     v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
+     v_tw3.real = _mm_load1_ps(tw3);
+     v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);
+
+     for (set = 0; set < set_count; set += 4) {
+       const OMX_F32 *in0 = in + set + grp * sub_num;
+       const OMX_F32 *in1 = in0 + set_count;
+       const OMX_F32 *in2 = in1 + set_count;
+       const OMX_F32 *in3 = in2 + set_count;
+
+       VC v_t0;
+       VC v_t1;
+       VC v_t2;
+       VC v_t3;
+       VC v_t4;
+       VC v_t5;
+       VC v_t6;
+       VC v_t7;
+
+       VC_LOAD_SPLIT(&v_t0, in0, n);
+       VC_LOAD_SPLIT(&v_t1, in1, n);
+       VC_LOAD_SPLIT(&v_t2, in2, n);
+       VC_LOAD_SPLIT(&v_t3, in3, n);
+
+       OMX_F32 *out1 = out0 + n_by_4;
+       OMX_F32 *out2 = out1 + n_by_4;
+       OMX_F32 *out3 = out2 + n_by_4;
+
+       RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+                            &v_tw1, &v_tw2, &v_tw3,
+                            &v_t0, &v_t1, &v_t2, &v_t3);
+
+       RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                                  &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+       out0 += 4;
+     }
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c
new file mode 100644
index 00000000000..9f17d61b757
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+ // Twiddle-free radix-2 butterfly pass of the inverse transform.
+ //
+ // Both buffers use a split layout: the real part of element k is at index k
+ // and its imaginary part at index k + n. For every k below n / 2 the pass
+ // combines elements k and k + n / 2 of |in| and writes their sum to element k
+ // and their difference to element k + n / 2 of |out|.
+ void x86SP_FFT_CToC_FC32_Inv_Radix2_fs(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     OMX_INT n) {
+   const OMX_INT half = n >> 1;
+   OMX_INT k;
+
+   for (k = 0; k < half; ++k) {
+     const OMX_F32 *upper = in + k;
+     const OMX_F32 *lower = upper + half;
+     OMX_F32 *sum = out + k;
+     OMX_F32 *diff = sum + half;
+
+     // sum = upper + lower
+     sum[0] = upper[0] + lower[0];
+     sum[n] = upper[n] + lower[n];
+
+     // diff = upper - lower
+     diff[0] = upper[0] - lower[0];
+     diff[n] = upper[n] - lower[n];
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c
new file mode 100644
index 00000000000..ec545c5365a
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+ // Radix-2 butterfly pass of the inverse transform over adjacent input pairs.
+ //
+ // Buffers use a split layout (real part at index k, imaginary part at
+ // k + n); the twiddle table keeps real parts at index k and imaginary parts
+ // at k + 2 * n. For each even k below n, element k + 1 of |in| is multiplied
+ // by the conjugate of twiddle k, then added to / subtracted from element k.
+ // The sum goes to element k / 2 of |out| and the difference n / 2 elements
+ // after it.
+ void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n) {
+   OMX_F32 *sum = out;
+   OMX_INT k = 0;
+
+   while (k < n) {
+     const OMX_F32 *even = in + k;
+     const OMX_F32 *odd = even + 1;
+     const OMX_F32 *w_re = twiddle + k;
+     const OMX_F32 *w_im = w_re + (n << 1);
+     OMX_F32 *diff = sum + (n >> 1);
+     OMX_FC32 prod;
+
+     // prod = conj(w) * odd
+     prod.Re = w_re[0] * odd[0] + w_im[0] * odd[n];
+     prod.Im = w_re[0] * odd[n] - w_im[0] * odd[0];
+
+     // sum = even + prod
+     sum[0] = even[0] + prod.Re;
+     sum[n] = even[n] + prod.Im;
+
+     // diff = even - prod
+     diff[0] = even[0] - prod.Re;
+     diff[n] = even[n] - prod.Im;
+
+     ++sum;
+     k += 2;
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c
new file mode 100644
index 00000000000..abad0cc998d
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+ // SSE radix-2 pass of the inverse transform over adjacent input pairs.
+ //
+ // Each iteration consumes eight consecutive inputs (real parts at |in|,
+ // imaginary parts n floats later) and advances the output by four floats.
+ // Twiddles come from every second table entry, with imaginary parts n * 2
+ // floats after the real parts. One result vector is stored at the running
+ // output position and the other n / 2 floats after it.
+ void x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n) {
+   OMX_INT base;
+
+   for (base = 0; base < n; base += 8) {
+     const OMX_F32 *w_re = twiddle + base;
+     const OMX_F32 *w_im = w_re + (n << 1);
+     const OMX_F32 *src_re = in + base;
+     const OMX_F32 *src_im = src_re + n;
+     OMX_F32 *sum = out + (base >> 1);
+     OMX_F32 *diff = sum + (n >> 1);
+     VC v_w;
+     VC v_lhs;
+     VC v_rhs;
+     VC v_prod;
+
+     // Gather four twiddles, one per butterfly.
+     v_w.real = _mm_set_ps(w_re[6], w_re[4], w_re[2], w_re[0]);
+     v_w.imag = _mm_set_ps(w_im[6], w_im[4], w_im[2], w_im[0]);
+
+     // Real parts, then imaginary parts, of both butterfly operands.
+     VC_LOAD_SHUFFLE(&(v_lhs.real), &(v_rhs.real), src_re);
+     VC_LOAD_SHUFFLE(&(v_lhs.imag), &(v_rhs.imag), src_im);
+
+     VC_CONJ_MUL(&v_prod, &v_w, &v_rhs);
+
+     VC_SUB_STORE_SPLIT(diff, &v_lhs, &v_prod, n);
+
+     VC_ADD_STORE_SPLIT(sum, &v_lhs, &v_prod, n);
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c
new file mode 100644
index 00000000000..78bc9ebdb61
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+ // Radix-2 butterfly pass of the inverse transform over sub_size groups of
+ // sub_num elements each.
+ //
+ // Buffers use a split layout (imaginary part n floats after the real part);
+ // twiddle imaginary parts are n * 2 floats after the real parts. Every group
+ // uses the single twiddle at offset grp * sub_num. Within a group, element
+ // s + sub_num / 2 is multiplied by the conjugate twiddle and combined with
+ // element s. Sums are written consecutively to |out| and differences n / 2
+ // floats after them.
+ void x86SP_FFT_CToC_FC32_Inv_Radix2_ms(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n,
+     OMX_INT sub_size,
+     OMX_INT sub_num) {
+   const OMX_INT sets = sub_num >> 1;
+   const OMX_INT half = n >> 1;
+   OMX_F32 *sum = out;
+   OMX_INT g;
+
+   for (g = 0; g < sub_size; ++g) {
+     const OMX_F32 *w_re = twiddle + g * sub_num;
+     const OMX_F32 *w_im = w_re + (n << 1);
+     const OMX_F32 *group_in = in + g * sub_num;
+     OMX_INT s;
+
+     for (s = 0; s < sets; ++s, ++sum) {
+       const OMX_F32 *first = group_in + s;
+       const OMX_F32 *second = first + sets;
+       OMX_F32 *diff = sum + half;
+       OMX_FC32 prod;
+
+       // prod = conj(w) * second
+       prod.Re = w_re[0] * second[0] + w_im[0] * second[n];
+       prod.Im = w_re[0] * second[n] - w_im[0] * second[0];
+
+       // sum = first + prod
+       sum[0] = first[0] + prod.Re;
+       sum[n] = first[n] + prod.Im;
+
+       // diff = first - prod
+       diff[0] = first[0] - prod.Re;
+       diff[n] = first[n] - prod.Im;
+     }
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c
new file mode 100644
index 00000000000..bb80fa30830
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+ // Twiddle-free radix-4 butterfly pass of the inverse transform.
+ //
+ // Buffers use a split layout: real part at index k, imaginary part at
+ // k + n. For every k below n / 4 the four inputs k, k + n/4, k + n/2 and
+ // k + 3n/4 are combined into four outputs at the same four offsets of |out|.
+ void x86SP_FFT_CToC_FC32_Inv_Radix4_fs(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     OMX_INT n) {
+   const OMX_INT quarter = n >> 2;
+   OMX_INT k;
+
+   for (k = 0; k < quarter; ++k) {
+     const OMX_F32 *x0 = in + k;
+     const OMX_F32 *x1 = x0 + quarter;
+     const OMX_F32 *x2 = x1 + quarter;
+     const OMX_F32 *x3 = x2 + quarter;
+     OMX_F32 *y0 = out + k;
+     OMX_F32 *y1 = y0 + quarter;
+     OMX_F32 *y2 = y1 + quarter;
+     OMX_F32 *y3 = y2 + quarter;
+     OMX_FC32 even_sum;
+     OMX_FC32 even_diff;
+     OMX_FC32 odd_sum;
+     OMX_FC32 odd_diff;
+
+     // even_sum = x0 + x2, even_diff = x0 - x2
+     even_sum.Re = x0[0] + x2[0];
+     even_sum.Im = x0[n] + x2[n];
+     even_diff.Re = x0[0] - x2[0];
+     even_diff.Im = x0[n] - x2[n];
+
+     // odd_sum = x1 + x3, odd_diff = x1 - x3
+     odd_sum.Re = x1[0] + x3[0];
+     odd_sum.Im = x1[n] + x3[n];
+     odd_diff.Re = x1[0] - x3[0];
+     odd_diff.Im = x1[n] - x3[n];
+
+     // y0 = even_sum + odd_sum
+     y0[0] = even_sum.Re + odd_sum.Re;
+     y0[n] = even_sum.Im + odd_sum.Im;
+
+     // y2 = even_sum - odd_sum
+     y2[0] = even_sum.Re - odd_sum.Re;
+     y2[n] = even_sum.Im - odd_sum.Im;
+
+     // y1 = even_diff + j * odd_diff
+     y1[0] = even_diff.Re - odd_diff.Im;
+     y1[n] = even_diff.Im + odd_diff.Re;
+
+     // y3 = even_diff - j * odd_diff
+     y3[0] = even_diff.Re + odd_diff.Im;
+     y3[n] = even_diff.Im - odd_diff.Re;
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c
new file mode 100644
index 00000000000..c3921bc46a4
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+ // SSE twiddle-free radix-4 butterfly pass of the inverse transform.
+ //
+ // Works on four butterflies per iteration: inputs are taken at offsets k,
+ // k + n/4, k + n/2 and k + 3n/4 of |in|, and results are stored at the same
+ // four offsets of |out|. n is handed to the load/store helpers together with
+ // each base pointer.
+ void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     OMX_INT n) {
+   const OMX_INT quarter = n >> 2;
+   OMX_INT k;
+
+   for (k = 0; k < quarter; k += 4) {
+     const OMX_F32 *x0 = in + k;
+     const OMX_F32 *x1 = x0 + quarter;
+     const OMX_F32 *x2 = x1 + quarter;
+     const OMX_F32 *x3 = x2 + quarter;
+     OMX_F32 *y0 = out + k;
+     OMX_F32 *y1 = y0 + quarter;
+     OMX_F32 *y2 = y1 + quarter;
+     OMX_F32 *y3 = y2 + quarter;
+     VC v_x0;
+     VC v_x1;
+     VC v_x2;
+     VC v_x3;
+     VC v_y0;
+     VC v_y1;
+     VC v_y2;
+     VC v_y3;
+
+     VC_LOAD_SPLIT(&v_x0, x0, n);
+     VC_LOAD_SPLIT(&v_x1, x1, n);
+     VC_LOAD_SPLIT(&v_x2, x2, n);
+     VC_LOAD_SPLIT(&v_x3, x3, n);
+
+     RADIX4_BUTTERFLY_FS(&v_y0, &v_y1, &v_y2, &v_y3,
+                         &v_x0, &v_x1, &v_x2, &v_x3);
+
+     RADIX4_INV_BUTTERFLY_STORE(y0, y1, y2, y3,
+                                &v_y0, &v_y1, &v_y2, &v_y3, n);
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c
new file mode 100644
index 00000000000..705d9cbc342
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+ // Radix-4 butterfly pass of the inverse transform over runs of four adjacent
+ // inputs.
+ //
+ // Buffers use a split layout (imaginary part n floats after the real part);
+ // twiddle imaginary parts are n * 2 floats after the real parts. For each
+ // even k below n / 2, inputs 2k .. 2k + 3 are read. The last three are
+ // multiplied by the conjugates of the twiddles at offsets k, 2k and 3k. The
+ // four results go to output k / 2 and to the positions n/4, n/2 and 3n/4
+ // after it.
+ void x86SP_FFT_CToC_FC32_Inv_Radix4_ls(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n) {
+   const OMX_INT half = n >> 1;
+   const OMX_INT quarter = n >> 2;
+   const OMX_INT w_imag = n << 1;
+   OMX_F32 *y0 = out;
+   OMX_INT k;
+
+   for (k = 0; k < half; k += 2, ++y0) {
+     const OMX_F32 *w1 = twiddle + k;
+     const OMX_F32 *w2 = w1 + k;
+     const OMX_F32 *w3 = w2 + k;
+     const OMX_F32 *x0 = in + (k << 1);
+     const OMX_F32 *x1 = x0 + 1;
+     const OMX_F32 *x2 = x1 + 1;
+     const OMX_F32 *x3 = x2 + 1;
+     OMX_F32 *y1 = y0 + quarter;
+     OMX_F32 *y2 = y1 + quarter;
+     OMX_F32 *y3 = y2 + quarter;
+     OMX_FC32 p1;
+     OMX_FC32 p2;
+     OMX_FC32 p3;
+     OMX_FC32 even_sum;
+     OMX_FC32 even_diff;
+     OMX_FC32 odd_sum;
+     OMX_FC32 odd_diff;
+
+     // p1 = conj(w1) * x1
+     p1.Re = w1[0] * x1[0] + w1[w_imag] * x1[n];
+     p1.Im = w1[0] * x1[n] - w1[w_imag] * x1[0];
+
+     // p2 = conj(w2) * x2
+     p2.Re = w2[0] * x2[0] + w2[w_imag] * x2[n];
+     p2.Im = w2[0] * x2[n] - w2[w_imag] * x2[0];
+
+     // p3 = conj(w3) * x3
+     p3.Re = w3[0] * x3[0] + w3[w_imag] * x3[n];
+     p3.Im = w3[0] * x3[n] - w3[w_imag] * x3[0];
+
+     // even_sum = x0 + p2, even_diff = x0 - p2
+     even_sum.Re = x0[0] + p2.Re;
+     even_sum.Im = x0[n] + p2.Im;
+     even_diff.Re = x0[0] - p2.Re;
+     even_diff.Im = x0[n] - p2.Im;
+
+     // odd_sum = p1 + p3, odd_diff = p1 - p3
+     odd_sum.Re = p1.Re + p3.Re;
+     odd_sum.Im = p1.Im + p3.Im;
+     odd_diff.Re = p1.Re - p3.Re;
+     odd_diff.Im = p1.Im - p3.Im;
+
+     // y0 = even_sum + odd_sum
+     y0[0] = even_sum.Re + odd_sum.Re;
+     y0[n] = even_sum.Im + odd_sum.Im;
+
+     // y2 = even_sum - odd_sum
+     y2[0] = even_sum.Re - odd_sum.Re;
+     y2[n] = even_sum.Im - odd_sum.Im;
+
+     // y1 = even_diff + j * odd_diff
+     y1[0] = even_diff.Re - odd_diff.Im;
+     y1[n] = even_diff.Im + odd_diff.Re;
+
+     // y3 = even_diff - j * odd_diff
+     y3[0] = even_diff.Re + odd_diff.Im;
+     y3[n] = even_diff.Im - odd_diff.Re;
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c
new file mode 100644
index 00000000000..2e245faf1a5
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+ // SSE radix-4 butterfly pass of the inverse transform over runs of four
+ // adjacent inputs; four butterflies are processed per iteration.
+ //
+ // For each k (a multiple of 8 below n / 2), sixteen consecutive inputs
+ // starting at 2k are loaded through VC_LOAD_MATRIX_TRANSPOSE. The twiddle
+ // lanes are gathered with strides 2, 4 and 6 from table offsets k, 2k and 3k;
+ // imaginary parts sit n * 2 floats after the real parts. Results are stored
+ // at output k / 2 and n/4, n/2 and 3n/4 floats after it.
+ void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n) {
+   const OMX_INT half = n >> 1;
+   const OMX_INT quarter = n >> 2;
+   const OMX_INT w_imag = n << 1;
+   OMX_INT k;
+
+   for (k = 0; k < half; k += 8) {
+     const OMX_F32 *w1_re = twiddle + k;
+     const OMX_F32 *w2_re = w1_re + k;
+     const OMX_F32 *w3_re = w2_re + k;
+     const OMX_F32 *w1_im = w1_re + w_imag;
+     const OMX_F32 *w2_im = w2_re + w_imag;
+     const OMX_F32 *w3_im = w3_re + w_imag;
+     const OMX_F32 *x0 = in + (k << 1);
+     const OMX_F32 *x1 = x0 + 4;
+     const OMX_F32 *x2 = x1 + 4;
+     const OMX_F32 *x3 = x2 + 4;
+     OMX_F32 *y0 = out + (k >> 1);
+     OMX_F32 *y1 = y0 + quarter;
+     OMX_F32 *y2 = y1 + quarter;
+     OMX_F32 *y3 = y2 + quarter;
+     VC v_w1;
+     VC v_w2;
+     VC v_w3;
+     VC v_x0;
+     VC v_x1;
+     VC v_x2;
+     VC v_x3;
+     VC v_y0;
+     VC v_y1;
+     VC v_y2;
+     VC v_y3;
+
+     // _mm_set_ps takes its arguments from the highest lane down to lane 0.
+     v_w1.real = _mm_set_ps(w1_re[6], w1_re[4], w1_re[2], w1_re[0]);
+     v_w1.imag = _mm_set_ps(w1_im[6], w1_im[4], w1_im[2], w1_im[0]);
+     v_w2.real = _mm_set_ps(w2_re[12], w2_re[8], w2_re[4], w2_re[0]);
+     v_w2.imag = _mm_set_ps(w2_im[12], w2_im[8], w2_im[4], w2_im[0]);
+     v_w3.real = _mm_set_ps(w3_re[18], w3_re[12], w3_re[6], w3_re[0]);
+     v_w3.imag = _mm_set_ps(w3_im[18], w3_im[12], w3_im[6], w3_im[0]);
+
+     VC_LOAD_MATRIX_TRANSPOSE(&v_x0, &v_x1, &v_x2, &v_x3, x0, x1, x2, x3, n);
+
+     RADIX4_INV_BUTTERFLY(&v_y0, &v_y1, &v_y2, &v_y3,
+                          &v_w1, &v_w2, &v_w3,
+                          &v_x0, &v_x1, &v_x2, &v_x3);
+
+     RADIX4_INV_BUTTERFLY_STORE(y0, y1, y2, y3,
+                                &v_y0, &v_y1, &v_y2, &v_y3, n);
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c
new file mode 100644
index 00000000000..499036b9347
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+ // Radix-4 butterfly pass of the inverse transform over sub_size groups of
+ // sub_num elements each.
+ //
+ // Buffers use a split layout (imaginary part n floats after the real part);
+ // twiddle imaginary parts are n * 2 floats after the real parts. Each group
+ // holds four runs of sub_num / 4 sets; one butterfly combines the same set
+ // index from the four runs. Group 0 is computed without any twiddle
+ // multiplication. Group g > 0 multiplies runs 1..3 by the conjugates of the
+ // twiddles at offsets g * step, 2 * g * step and 3 * g * step, where step is
+ // sub_num / 2. Outputs are written consecutively, with the four butterfly
+ // results n / 4 floats apart.
+ void x86SP_FFT_CToC_FC32_Inv_Radix4_ms(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n,
+     OMX_INT sub_size,
+     OMX_INT sub_num) {
+   const OMX_INT w_step = sub_num >> 1;
+   const OMX_INT sets = sub_num >> 2;
+   const OMX_INT quarter = n >> 2;
+   const OMX_INT w_imag = n << 1;
+   OMX_F32 *y0 = out;
+   OMX_INT g;
+   OMX_INT s;
+
+   // Group 0: plain butterflies.
+   for (s = 0; s < sets; ++s, ++y0) {
+     const OMX_F32 *x0 = in + s;
+     const OMX_F32 *x1 = x0 + sets;
+     const OMX_F32 *x2 = x1 + sets;
+     const OMX_F32 *x3 = x2 + sets;
+     OMX_F32 *y1 = y0 + quarter;
+     OMX_F32 *y2 = y1 + quarter;
+     OMX_F32 *y3 = y2 + quarter;
+     OMX_FC32 even_sum;
+     OMX_FC32 even_diff;
+     OMX_FC32 odd_sum;
+     OMX_FC32 odd_diff;
+
+     // even_sum = x0 + x2, even_diff = x0 - x2
+     even_sum.Re = x0[0] + x2[0];
+     even_sum.Im = x0[n] + x2[n];
+     even_diff.Re = x0[0] - x2[0];
+     even_diff.Im = x0[n] - x2[n];
+
+     // odd_sum = x1 + x3, odd_diff = x1 - x3
+     odd_sum.Re = x1[0] + x3[0];
+     odd_sum.Im = x1[n] + x3[n];
+     odd_diff.Re = x1[0] - x3[0];
+     odd_diff.Im = x1[n] - x3[n];
+
+     // y0 = even_sum + odd_sum
+     y0[0] = even_sum.Re + odd_sum.Re;
+     y0[n] = even_sum.Im + odd_sum.Im;
+
+     // y2 = even_sum - odd_sum
+     y2[0] = even_sum.Re - odd_sum.Re;
+     y2[n] = even_sum.Im - odd_sum.Im;
+
+     // y1 = even_diff + j * odd_diff
+     y1[0] = even_diff.Re - odd_diff.Im;
+     y1[n] = even_diff.Im + odd_diff.Re;
+
+     // y3 = even_diff - j * odd_diff
+     y3[0] = even_diff.Re + odd_diff.Im;
+     y3[n] = even_diff.Im - odd_diff.Re;
+   }
+
+   // Groups 1 .. sub_size - 1: butterflies with conjugated twiddles.
+   for (g = 1; g < sub_size; ++g) {
+     const OMX_F32 *w1 = twiddle + g * w_step;
+     const OMX_F32 *w2 = w1 + g * w_step;
+     const OMX_F32 *w3 = w2 + g * w_step;
+     const OMX_F32 *group_in = in + g * sub_num;
+
+     for (s = 0; s < sets; ++s, ++y0) {
+       const OMX_F32 *x0 = group_in + s;
+       const OMX_F32 *x1 = x0 + sets;
+       const OMX_F32 *x2 = x1 + sets;
+       const OMX_F32 *x3 = x2 + sets;
+       OMX_F32 *y1 = y0 + quarter;
+       OMX_F32 *y2 = y1 + quarter;
+       OMX_F32 *y3 = y2 + quarter;
+       OMX_FC32 p1;
+       OMX_FC32 p2;
+       OMX_FC32 p3;
+       OMX_FC32 even_sum;
+       OMX_FC32 even_diff;
+       OMX_FC32 odd_sum;
+       OMX_FC32 odd_diff;
+
+       // p1 = conj(w1) * x1
+       p1.Re = w1[0] * x1[0] + w1[w_imag] * x1[n];
+       p1.Im = w1[0] * x1[n] - w1[w_imag] * x1[0];
+
+       // p2 = conj(w2) * x2
+       p2.Re = w2[0] * x2[0] + w2[w_imag] * x2[n];
+       p2.Im = w2[0] * x2[n] - w2[w_imag] * x2[0];
+
+       // p3 = conj(w3) * x3
+       p3.Re = w3[0] * x3[0] + w3[w_imag] * x3[n];
+       p3.Im = w3[0] * x3[n] - w3[w_imag] * x3[0];
+
+       // even_sum = x0 + p2, even_diff = x0 - p2
+       even_sum.Re = x0[0] + p2.Re;
+       even_sum.Im = x0[n] + p2.Im;
+       even_diff.Re = x0[0] - p2.Re;
+       even_diff.Im = x0[n] - p2.Im;
+
+       // odd_sum = p1 + p3, odd_diff = p1 - p3
+       odd_sum.Re = p1.Re + p3.Re;
+       odd_sum.Im = p1.Im + p3.Im;
+       odd_diff.Re = p1.Re - p3.Re;
+       odd_diff.Im = p1.Im - p3.Im;
+
+       // y0 = even_sum + odd_sum
+       y0[0] = even_sum.Re + odd_sum.Re;
+       y0[n] = even_sum.Im + odd_sum.Im;
+
+       // y2 = even_sum - odd_sum
+       y2[0] = even_sum.Re - odd_sum.Re;
+       y2[n] = even_sum.Im - odd_sum.Im;
+
+       // y1 = even_diff + j * odd_diff
+       y1[0] = even_diff.Re - odd_diff.Im;
+       y1[n] = even_diff.Im + odd_diff.Re;
+
+       // y3 = even_diff - j * odd_diff
+       y3[0] = even_diff.Re + odd_diff.Im;
+       y3[n] = even_diff.Im - odd_diff.Re;
+     }
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c
new file mode 100644
index 00000000000..703f316920f
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+ // Inverse-transform path for set_count == 2, where the set loop is too short
+ // to be unrolled by 4 for SSE. Two neighbouring groups (two sets each) are
+ // packed into one 4-lane vector per iteration instead.
+ //
+ // in/out keep imaginary parts n floats after the real parts; the twiddle
+ // table keeps them n * 2 floats after the real parts. The input is read with
+ // _mm_load_ps, which needs 16-byte aligned addresses.
+ static void InternalUnroll2Inv(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n) {
+   const OMX_INT half = n >> 1;
+   const OMX_INT quarter = n >> 2;
+   const OMX_INT w_imag = n << 1;
+   OMX_F32 *y0 = out;
+   OMX_INT k;
+
+   for (k = 0; k < half; k += 8, y0 += 4) {
+     // w* address the twiddles of the first group of this iteration, w*_next
+     // those of the group that follows it.
+     const OMX_F32 *w1 = twiddle + k;
+     const OMX_F32 *w2 = w1 + k;
+     const OMX_F32 *w3 = w2 + k;
+     const OMX_F32 *w1_next = w1 + 4;
+     const OMX_F32 *w2_next = w2 + 8;
+     const OMX_F32 *w3_next = w3 + 12;
+     const OMX_F32 *src_re = in + (k << 1);
+     const OMX_F32 *src_im = src_re + n;
+     OMX_F32 *y1 = y0 + quarter;
+     OMX_F32 *y2 = y1 + quarter;
+     OMX_F32 *y3 = y2 + quarter;
+     VC v_w1;
+     VC v_w2;
+     VC v_w3;
+     VC v_x0;
+     VC v_x1;
+     VC v_x2;
+     VC v_x3;
+     VC v_y0;
+     VC v_y1;
+     VC v_y2;
+     VC v_y3;
+     __m128 re_a;
+     __m128 re_b;
+     __m128 re_c;
+     __m128 re_d;
+     __m128 im_a;
+     __m128 im_b;
+     __m128 im_c;
+     __m128 im_d;
+
+     // Lanes 0-1 take the first group's twiddle, lanes 2-3 the next group's.
+     v_w1.real = _mm_shuffle_ps(_mm_load_ss(w1),
+                                _mm_load_ss(w1_next),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+     v_w1.imag = _mm_shuffle_ps(_mm_load_ss(w1 + w_imag),
+                                _mm_load_ss(w1_next + w_imag),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+     v_w2.real = _mm_shuffle_ps(_mm_load_ss(w2),
+                                _mm_load_ss(w2_next),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+     v_w2.imag = _mm_shuffle_ps(_mm_load_ss(w2 + w_imag),
+                                _mm_load_ss(w2_next + w_imag),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+     v_w3.real = _mm_shuffle_ps(_mm_load_ss(w3),
+                                _mm_load_ss(w3_next),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+     v_w3.imag = _mm_shuffle_ps(_mm_load_ss(w3 + w_imag),
+                                _mm_load_ss(w3_next + w_imag),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+
+     // Real parts: floats {0,1,8,9}, {2,3,10,11}, {4,5,12,13}, {6,7,14,15}
+     // of the 16 loaded become the four butterfly inputs.
+     re_a = _mm_load_ps(src_re);
+     re_b = _mm_load_ps(src_re + 4);
+     re_c = _mm_load_ps(src_re + 8);
+     re_d = _mm_load_ps(src_re + 12);
+     v_x0.real = _mm_shuffle_ps(re_a, re_c, _MM_SHUFFLE(1, 0, 1, 0));
+     v_x1.real = _mm_shuffle_ps(re_a, re_c, _MM_SHUFFLE(3, 2, 3, 2));
+     v_x2.real = _mm_shuffle_ps(re_b, re_d, _MM_SHUFFLE(1, 0, 1, 0));
+     v_x3.real = _mm_shuffle_ps(re_b, re_d, _MM_SHUFFLE(3, 2, 3, 2));
+
+     // Imaginary parts: same gathering, n floats further on.
+     im_a = _mm_load_ps(src_im);
+     im_b = _mm_load_ps(src_im + 4);
+     im_c = _mm_load_ps(src_im + 8);
+     im_d = _mm_load_ps(src_im + 12);
+     v_x0.imag = _mm_shuffle_ps(im_a, im_c, _MM_SHUFFLE(1, 0, 1, 0));
+     v_x1.imag = _mm_shuffle_ps(im_a, im_c, _MM_SHUFFLE(3, 2, 3, 2));
+     v_x2.imag = _mm_shuffle_ps(im_b, im_d, _MM_SHUFFLE(1, 0, 1, 0));
+     v_x3.imag = _mm_shuffle_ps(im_b, im_d, _MM_SHUFFLE(3, 2, 3, 2));
+
+     RADIX4_INV_BUTTERFLY(&v_y0, &v_y1, &v_y2, &v_y3,
+                          &v_w1, &v_w2, &v_w3,
+                          &v_x0, &v_x1, &v_x2, &v_x3);
+
+     RADIX4_INV_BUTTERFLY_STORE(y0, y1, y2, y3,
+                                &v_y0, &v_y1, &v_y2, &v_y3, n);
+   }
+ }
+
+ // SSE radix-4 butterfly pass of the inverse transform over sub_size groups of
+ // sub_num elements each.
+ //
+ // A group holds four runs of sub_num / 4 sets. When there are exactly two
+ // sets per run the work is delegated to InternalUnroll2Inv. Otherwise four
+ // sets are processed per iteration: group 0 uses the twiddle-free butterfly,
+ // and every later group broadcasts its three twiddles (imaginary parts n * 2
+ // floats after the real parts) to all lanes. Results are written
+ // consecutively, with the four butterfly outputs n / 4 floats apart.
+ void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
+     const OMX_F32 *in,
+     OMX_F32 *out,
+     const OMX_F32 *twiddle,
+     OMX_INT n,
+     OMX_INT sub_size,
+     OMX_INT sub_num) {
+   const OMX_INT w_step = sub_num >> 1;
+   const OMX_INT sets = sub_num >> 2;
+   const OMX_INT quarter = n >> 2;
+   const OMX_INT w_imag = n << 1;
+   OMX_F32 *y0 = out;
+   OMX_INT g;
+   OMX_INT s;
+
+   if (sets == 2) {
+     InternalUnroll2Inv(in, out, twiddle, n);
+     return;
+   }
+
+   // Group 0: no twiddles involved.
+   for (s = 0; s < sets; s += 4, y0 += 4) {
+     const OMX_F32 *x0 = in + s;
+     const OMX_F32 *x1 = x0 + sets;
+     const OMX_F32 *x2 = x1 + sets;
+     const OMX_F32 *x3 = x2 + sets;
+     OMX_F32 *y1 = y0 + quarter;
+     OMX_F32 *y2 = y1 + quarter;
+     OMX_F32 *y3 = y2 + quarter;
+     VC v_x0;
+     VC v_x1;
+     VC v_x2;
+     VC v_x3;
+     VC v_y0;
+     VC v_y1;
+     VC v_y2;
+     VC v_y3;
+
+     VC_LOAD_SPLIT(&v_x0, x0, n);
+     VC_LOAD_SPLIT(&v_x1, x1, n);
+     VC_LOAD_SPLIT(&v_x2, x2, n);
+     VC_LOAD_SPLIT(&v_x3, x3, n);
+
+     RADIX4_BUTTERFLY_FS(&v_y0, &v_y1, &v_y2, &v_y3,
+                         &v_x0, &v_x1, &v_x2, &v_x3);
+
+     RADIX4_INV_BUTTERFLY_STORE(y0, y1, y2, y3,
+                                &v_y0, &v_y1, &v_y2, &v_y3, n);
+   }
+
+   // Remaining groups: one twiddle triple per group.
+   for (g = 1; g < sub_size; ++g) {
+     const OMX_F32 *w1 = twiddle + g * w_step;
+     const OMX_F32 *w2 = w1 + g * w_step;
+     const OMX_F32 *w3 = w2 + g * w_step;
+     const OMX_F32 *group_in = in + g * sub_num;
+     VC v_w1;
+     VC v_w2;
+     VC v_w3;
+
+     v_w1.real = _mm_load1_ps(w1);
+     v_w1.imag = _mm_load1_ps(w1 + w_imag);
+     v_w2.real = _mm_load1_ps(w2);
+     v_w2.imag = _mm_load1_ps(w2 + w_imag);
+     v_w3.real = _mm_load1_ps(w3);
+     v_w3.imag = _mm_load1_ps(w3 + w_imag);
+
+     for (s = 0; s < sets; s += 4, y0 += 4) {
+       const OMX_F32 *x0 = group_in + s;
+       const OMX_F32 *x1 = x0 + sets;
+       const OMX_F32 *x2 = x1 + sets;
+       const OMX_F32 *x3 = x2 + sets;
+       OMX_F32 *y1 = y0 + quarter;
+       OMX_F32 *y2 = y1 + quarter;
+       OMX_F32 *y3 = y2 + quarter;
+       VC v_x0;
+       VC v_x1;
+       VC v_x2;
+       VC v_x3;
+       VC v_y0;
+       VC v_y1;
+       VC v_y2;
+       VC v_y3;
+
+       VC_LOAD_SPLIT(&v_x0, x0, n);
+       VC_LOAD_SPLIT(&v_x1, x1, n);
+       VC_LOAD_SPLIT(&v_x2, x2, n);
+       VC_LOAD_SPLIT(&v_x3, x3, n);
+
+       RADIX4_INV_BUTTERFLY(&v_y0, &v_y1, &v_y2, &v_y3,
+                            &v_w1, &v_w2, &v_w3,
+                            &v_x0, &v_x1, &v_x2, &v_x3);
+
+       RADIX4_INV_BUTTERFLY_STORE(y0, y1, y2, y3,
+                                  &v_y0, &v_y1, &v_y2, &v_y3, n);
+     }
+   }
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c
new file mode 100644
index 00000000000..0a3d816ffe4
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include <stdbool.h>
+
+ // Stage routines used by the radix-2 kernel below; they are defined in their
+ // own source files.
+
+ // Opening pass (no twiddle table).
+ extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(const OMX_F32 *in,
+                                               OMX_F32 *out,
+                                               OMX_INT n);
+ extern void x86SP_FFT_CToC_FC32_Inv_Radix2_fs(const OMX_F32 *in,
+                                               OMX_F32 *out,
+                                               OMX_INT n);
+
+ // Intermediate passes, parameterised by group count and group length.
+ extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(const OMX_F32 *in,
+                                               OMX_F32 *out,
+                                               const OMX_F32 *twiddle,
+                                               OMX_INT n,
+                                               OMX_INT sub_size,
+                                               OMX_INT sub_num);
+ extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ms(const OMX_F32 *in,
+                                               OMX_F32 *out,
+                                               const OMX_F32 *twiddle,
+                                               OMX_INT n,
+                                               OMX_INT sub_size,
+                                               OMX_INT sub_num);
+
+ // Closing pass.
+ extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(const OMX_F32 *in,
+                                               OMX_F32 *out,
+                                               const OMX_F32 *twiddle,
+                                               OMX_INT n);
+ extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(const OMX_F32 *in,
+                                               OMX_F32 *out,
+                                               const OMX_F32 *twiddle,
+                                               OMX_INT n);
+
+ // Runs all radix-2 passes of an out-of-place transform and returns the
+ // buffer (|buf1| or |buf2|) that holds the final result.
+ //
+ // The opening pass reads |src| and writes |buf1|. Every following pass reads
+ // the buffer written by the previous one and writes the other, so the two
+ // buffers swap roles after each pass. |forward_fft| selects the Fwd or the
+ // Inv variant of every pass. The closing pass is skipped when the group
+ // length has already dropped to 1 or below (e.g. n == 2).
+ OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+     const OMX_F32 *src,
+     // Two Ping Pong buffers for out of place kernel.
+     OMX_F32 *buf1,
+     OMX_F32 *buf2,
+     const OMX_F32 *twiddle,
+     OMX_INT n,
+     bool forward_fft) {
+   const OMX_INT half = n >> 1;
+   OMX_F32 *current = buf1;   // Output of the most recent pass.
+   OMX_F32 *next = buf2;      // Destination of the upcoming pass.
+   OMX_INT groups = 2;
+   OMX_INT group_len = half;
+
+   // Opening pass.
+   if (forward_fft) {
+     x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(src, current, n);
+   } else {
+     x86SP_FFT_CToC_FC32_Inv_Radix2_fs(src, current, n);
+   }
+
+   // Intermediate passes: the group count doubles and the group length halves
+   // after each one.
+   while (groups < half) {
+     OMX_F32 *swap;
+
+     if (forward_fft) {
+       x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(current, next, twiddle,
+                                         n, groups, group_len);
+     } else {
+       x86SP_FFT_CToC_FC32_Inv_Radix2_ms(current, next, twiddle,
+                                         n, groups, group_len);
+     }
+
+     swap = next;
+     next = current;
+     current = swap;
+
+     groups <<= 1;
+     group_len >>= 1;
+   }
+
+   // Nothing left for the closing pass when the group length is 1 or less.
+   if (group_len <= 1) {
+     return current;
+   }
+
+   if (forward_fft) {
+     x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(current, next, twiddle, n);
+   } else {
+     x86SP_FFT_CToC_FC32_Inv_Radix2_ls(current, next, twiddle, n);
+   }
+
+   return next;
+ }
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c
new file mode 100644
index 00000000000..e7c7b892724
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+#include "dl/api/omxtypes.h"
+#include <stdbool.h>
+
+/*
+ * Radix-4 first-pass routines, declared extern (definitions are expected
+ * in other files). The kernels below call exactly one of them first, with
+ * the caller's source as in and buf1 as out. The plain variants are used by
+ * x86SP_F32_radix4_kernel_OutOfPlace(), the _sse variants by
+ * x86SP_F32_radix4_kernel_OutOfPlace_sse().
+ */
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ OMX_INT n);
+
+/* Inverse-transform counterpart of x86SP_FFT_CToC_FC32_Fwd_Radix4_fs. */
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ OMX_INT n);
+
+/* Forward first pass selected by the _sse kernel. */
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ OMX_INT n);
+
+/* Inverse first pass selected by the _sse kernel. */
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ OMX_INT n);
+
+/*
+ * Radix-4 routines for the passes between the first and the last one,
+ * declared extern (definitions are expected in other files). The kernels
+ * below call one of them per loop iteration with the ping-pong buffers as
+ * in/out; sub_size starts at 4 and is multiplied by 4 each iteration while
+ * sub_num starts at n / 4 and is divided by 4.
+ */
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ OMX_INT sub_size,
+ OMX_INT sub_num);
+
+/* Inverse-transform counterpart of x86SP_FFT_CToC_FC32_Fwd_Radix4_ms. */
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ OMX_INT sub_size,
+ OMX_INT sub_num);
+
+/* Forward intermediate pass selected by the _sse kernel. */
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ OMX_INT sub_size,
+ OMX_INT sub_num);
+
+/* Inverse intermediate pass selected by the _sse kernel. */
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ OMX_INT sub_size,
+ OMX_INT sub_num);
+
+/*
+ * Radix-4 last-pass routines, declared extern (definitions are expected in
+ * other files). The kernels below call one of them as the final pass
+ * whenever sub_num is not 2 after the loop. The plain variants are used by
+ * x86SP_F32_radix4_kernel_OutOfPlace(), the _sse variants by
+ * x86SP_F32_radix4_kernel_OutOfPlace_sse().
+ */
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+/*
+ * Radix-2 last-pass routines. The kernels below call one of them as the
+ * final pass when sub_num == 2 after the loop. They are declared here so
+ * that those calls have a prototype in scope instead of relying on an
+ * implicit declaration.
+ * NOTE(review): the parameter lists mirror the call sites in this file
+ * (in, out, twiddle, n); confirm them against the definitions, in
+ * particular for the _sse variants.
+ */
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+/*
+ * Out-of-place radix-4 complex FFT driver (non-SSE routines).
+ *
+ * Runs the first pass from src into buf1, then alternates between buf1
+ * and buf2 for every following pass. src is only read.
+ *
+ *   src          input data
+ *   buf1, buf2   the two ping-pong work buffers
+ *   twiddle      twiddle table handed unchanged to every pass after the first
+ *   n            transform size, handed unchanged to every pass
+ *   forward_fft  true selects the Fwd routines, false the Inv routines
+ *
+ * Returns the buffer written by the final pass.
+ */
+OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft) {
+  const OMX_INT quarter_n = n >> 2;
+  OMX_F32 *cur = buf1;
+  OMX_F32 *next = buf2;
+  OMX_INT sub_size = 4;
+  OMX_INT sub_num = quarter_n;
+
+  // First pass: src -> buf1.
+  if (forward_fft) {
+    x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(src, cur, n);
+  } else {
+    x86SP_FFT_CToC_FC32_Inv_Radix4_fs(src, cur, n);
+  }
+
+  // Intermediate passes: each one reads cur and writes next, then the two
+  // buffers trade roles.
+  while (sub_size < quarter_n) {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(cur, next, twiddle,
+                                        n, sub_size, sub_num);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ms(cur, next, twiddle,
+                                        n, sub_size, sub_num);
+    }
+
+    OMX_F32 *swap = next;
+    next = cur;
+    cur = swap;
+
+    sub_size = sub_size << 2;
+    sub_num = sub_num >> 2;
+  }
+
+  // The final pass is radix-2 when two sub-transforms are left and
+  // radix-4 otherwise.
+  if (sub_num == 2) {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix2_ls(cur, next, twiddle, n);
+    }
+  } else {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ls(cur, next, twiddle, n);
+    }
+  }
+
+  return next;
+}
+
+/*
+ * Out-of-place radix-4 complex FFT driver built on the _sse routines.
+ *
+ * Runs the first pass from src into buf1, then alternates between buf1
+ * and buf2 for every following pass. src is only read.
+ *
+ *   src          input data
+ *   buf1, buf2   the two ping-pong work buffers
+ *   twiddle      twiddle table handed unchanged to every pass after the first
+ *   n            transform size, handed unchanged to every pass
+ *   forward_fft  true for forward, false for inverse
+ *
+ * Returns the buffer written by the final pass.
+ */
+OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft) {
+  const OMX_INT quarter_n = n >> 2;
+  OMX_F32 *cur = buf1;
+  OMX_F32 *next = buf2;
+  OMX_INT sub_size = 4;
+  OMX_INT sub_num = quarter_n;
+
+  // First pass: src -> buf1.
+  if (forward_fft) {
+    x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(src, cur, n);
+  } else {
+    x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(src, cur, n);
+  }
+
+  // Intermediate passes: each one reads cur and writes next, then the two
+  // buffers trade roles.
+  while (sub_size < quarter_n) {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(cur, next, twiddle,
+                                            n, sub_size, sub_num);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(cur, next, twiddle,
+                                            n, sub_size, sub_num);
+    }
+
+    OMX_F32 *swap = next;
+    next = cur;
+    cur = swap;
+
+    sub_size = sub_size << 2;
+    sub_num = sub_num >> 2;
+  }
+
+  // When n is not a power of 4 the loop leaves sub_num == 2 and the final
+  // pass is radix-2; otherwise it is radix-4.
+  if (sub_num == 2) {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(cur, next, twiddle, n);
+    }
+  } else {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(cur, next, twiddle, n);
+    }
+  }
+
+  return next;
+}
diff --git a/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h
new file mode 100644
index 00000000000..d10a851ae7a
--- /dev/null
+++ b/chromium/third_party/openmax_dl/dl/sp/src/x86/x86SP_SSE_Math.h
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+/**
+ * Two data formats are used by the FFT routines, internally.
+ *
+ * One is the interleaved format, which is also what the interface to the
+ * main external FFT routines uses: complex values are stored with the
+ * real part followed by the imaginary part.
+ *
+ * The other is the split format where a complex vector of real and
+ * imaginary values is split such that all of the real values are placed
+ * in the first half of the vector and the corresponding imaginary values
+ * are placed in the second half, in the same order. The conversion from
+ * interleaved complex values to split format and back is transparent to
+ * the external FFT interface.
+ *
+ * VComplex uses split format.
+ */
+
+/**
+ * VComplex holds 4 complex float elements, with the real parts stored
+ * in real and the corresponding imaginary parts in imag. Lane k of real
+ * and lane k of imag together form complex element k.
+ */
+typedef struct VComplex {
+ __m128 real;  /* real parts of the 4 elements */
+ __m128 imag;  /* imaginary parts of the 4 elements */
+} VC;
+
+/**
+ * Complex multiply of 4 packed elements: out = a * b
+ *   out.real = a.real * b.real - a.imag * b.imag
+ *   out.imag = a.real * b.imag + a.imag * b.real
+ *
+ * Both components are computed before either is stored, so out may be
+ * the same object as a or b.
+ */
+static inline void VC_MUL(VC *out, VC *a, VC *b) {
+  const __m128 re = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
+                               _mm_mul_ps(a->imag, b->imag));
+  const __m128 im = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
+                               _mm_mul_ps(a->imag, b->real));
+
+  out->real = re;
+  out->imag = im;
+}
+
+/**
+ * Conjugate complex multiply of 4 packed elements: out = conj(a) * b
+ *   out.real = a.real * b.real + a.imag * b.imag
+ *   out.imag = a.real * b.imag - a.imag * b.real
+ *
+ * Both components are computed before either is stored, so out may be
+ * the same object as a or b.
+ */
+static inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
+  const __m128 re = _mm_add_ps(_mm_mul_ps(a->real, b->real),
+                               _mm_mul_ps(a->imag, b->imag));
+  const __m128 im = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
+                               _mm_mul_ps(a->imag, b->real));
+
+  out->real = re;
+  out->imag = im;
+}
+
+/**
+ * Scale 4 packed complex elements by per-lane real factors:
+ *   out.real = factor * a.real
+ *   out.imag = factor * a.imag
+ */
+static inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
+  __m128 scaled;
+
+  scaled = _mm_mul_ps(factor, a->real);
+  out->real = scaled;
+  scaled = _mm_mul_ps(factor, a->imag);
+  out->imag = scaled;
+}
+
+/**
+ * Complex sum of 4 packed elements: out = a + b
+ *   out.real = a.real + b.real
+ *   out.imag = a.imag + b.imag
+ */
+static inline void VC_ADD(VC *out, VC *a, VC *b) {
+  __m128 sum;
+
+  sum = _mm_add_ps(a->real, b->real);
+  out->real = sum;
+  sum = _mm_add_ps(a->imag, b->imag);
+  out->imag = sum;
+}
+
+/**
+ * Cross add: the components of b are swapped before the addition.
+ *   out.real = a.real + b.imag
+ *   out.imag = b.real + a.imag
+ *
+ * Both components are computed before either is stored, so out may be
+ * the same object as a or b.
+ */
+static inline void VC_ADD_X(VC *out, VC *a, VC *b) {
+  const __m128 re = _mm_add_ps(a->real, b->imag);
+  const __m128 im = _mm_add_ps(b->real, a->imag);
+
+  out->real = re;
+  out->imag = im;
+}
+
+/**
+ * Compute a + b and store it in split format:
+ *   out[0..3]                   = a.real + b.real
+ *   out[offset..offset + 3]     = a.imag + b.imag
+ * Uses aligned stores (_mm_store_ps), so out and out + offset must both
+ * be 16 byte aligned.
+ */
+static inline void VC_ADD_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  __m128 sum;
+
+  sum = _mm_add_ps(a->real, b->real);
+  _mm_store_ps(out, sum);
+  sum = _mm_add_ps(a->imag, b->imag);
+  _mm_store_ps(out + offset, sum);
+}
+
+/**
+ * Complex difference of 4 packed elements: out = a - b
+ *   out.real = a.real - b.real
+ *   out.imag = a.imag - b.imag
+ */
+static inline void VC_SUB(VC *out, VC *a, VC *b) {
+  __m128 diff;
+
+  diff = _mm_sub_ps(a->real, b->real);
+  out->real = diff;
+  diff = _mm_sub_ps(a->imag, b->imag);
+  out->imag = diff;
+}
+
+/**
+ * Cross subtract: the components of b are swapped, and the imaginary
+ * result is b.real minus a.imag (note the operand order).
+ *   out.real = a.real - b.imag
+ *   out.imag = b.real - a.imag
+ *
+ * Both components are computed before either is stored, so out may be
+ * the same object as a or b.
+ */
+static inline void VC_SUB_X(VC *out, VC *a, VC *b) {
+  const __m128 re = _mm_sub_ps(a->real, b->imag);
+  const __m128 im = _mm_sub_ps(b->real, a->imag);
+
+  out->real = re;
+  out->imag = im;
+}
+
+/**
+ * Compute a - b and store it in split format:
+ *   out[0..3]                   = a.real - b.real
+ *   out[offset..offset + 3]     = a.imag - b.imag
+ * Uses aligned stores (_mm_store_ps), so out and out + offset must both
+ * be 16 byte aligned.
+ */
+static inline void VC_SUB_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  __m128 diff;
+
+  diff = _mm_sub_ps(a->real, b->real);
+  _mm_store_ps(out, diff);
+  diff = _mm_sub_ps(a->imag, b->imag);
+  _mm_store_ps(out + offset, diff);
+}
+
+/**
+ * Add the real parts, subtract the imaginary parts:
+ *   out.real = a.real + b.real
+ *   out.imag = a.imag - b.imag
+ */
+static inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
+  __m128 lanes;
+
+  lanes = _mm_add_ps(a->real, b->real);
+  out->real = lanes;
+  lanes = _mm_sub_ps(a->imag, b->imag);
+  out->imag = lanes;
+}
+
+/**
+ * Cross add/subtract: the components of b are swapped.
+ *   out.real = a.real + b.imag
+ *   out.imag = a.imag - b.real
+ *
+ * Both components are computed before either is stored, so out may be
+ * the same object as a or b.
+ */
+static inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
+  const __m128 re = _mm_add_ps(a->real, b->imag);
+  const __m128 im = _mm_sub_ps(a->imag, b->real);
+
+  out->real = re;
+  out->imag = im;
+}
+
+/**
+ * Same arithmetic as VC_ADD_SUB_X, stored in split format:
+ *   out[0..3]                   = a.real + b.imag
+ *   out[offset..offset + 3]     = a.imag - b.real
+ * Uses aligned stores (_mm_store_ps), so out and out + offset must both
+ * be 16 byte aligned.
+ */
+static inline void VC_ADD_SUB_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  __m128 lanes;
+
+  lanes = _mm_add_ps(a->real, b->imag);
+  _mm_store_ps(out, lanes);
+  lanes = _mm_sub_ps(a->imag, b->real);
+  _mm_store_ps(out + offset, lanes);
+}
+
+/**
+ * Subtract the real parts, add the imaginary parts:
+ *   out.real = a.real - b.real
+ *   out.imag = a.imag + b.imag
+ */
+static inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
+  __m128 lanes;
+
+  lanes = _mm_sub_ps(a->real, b->real);
+  out->real = lanes;
+  lanes = _mm_add_ps(a->imag, b->imag);
+  out->imag = lanes;
+}
+
+/**
+ * Cross subtract/add: the components of b are swapped.
+ *   out.real = a.real - b.imag
+ *   out.imag = a.imag + b.real
+ *
+ * Both components are computed before either is stored, so out may be
+ * the same object as a or b.
+ */
+static inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
+  const __m128 re = _mm_sub_ps(a->real, b->imag);
+  const __m128 im = _mm_add_ps(a->imag, b->real);
+
+  out->real = re;
+  out->imag = im;
+}
+
+/**
+ * Same arithmetic as VC_SUB_ADD_X, stored in split format:
+ *   out[0..3]                   = a.real - b.imag
+ *   out[offset..offset + 3]     = a.imag + b.real
+ * Uses aligned stores (_mm_store_ps), so out and out + offset must both
+ * be 16 byte aligned.
+ */
+static inline void VC_SUB_ADD_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a, VC *b,
+    OMX_INT offset) {
+  __m128 lanes;
+
+  lanes = _mm_sub_ps(a->real, b->imag);
+  _mm_store_ps(out, lanes);
+  lanes = _mm_add_ps(a->imag, b->real);
+  _mm_store_ps(out + offset, lanes);
+}
+
+/**
+ * Store a complex vector in split format:
+ *   out[0..3]                   = in.real
+ *   out[offset..offset + 3]     = in.imag
+ * Uses aligned stores (_mm_store_ps), so out and out + offset must both
+ * be 16 byte aligned.
+ */
+static inline void VC_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *in,
+    OMX_INT offset) {
+  OMX_F32 *const real_dst = out;
+  OMX_F32 *const imag_dst = out + offset;
+
+  _mm_store_ps(real_dst, in->real);
+  _mm_store_ps(imag_dst, in->imag);
+}
+
+/**
+ * Load a complex vector from split format:
+ *   out.real = in[0..3]
+ *   out.imag = in[offset..offset + 3]
+ * Uses aligned loads (_mm_load_ps), so in and in + offset must both be
+ * 16 byte aligned.
+ */
+static inline void VC_LOAD_SPLIT(
+    VC *out,
+    const OMX_F32 *in,
+    OMX_INT offset) {
+  const OMX_F32 *const real_src = in;
+  const OMX_F32 *const imag_src = in + offset;
+
+  out->real = _mm_load_ps(real_src);
+  out->imag = _mm_load_ps(imag_src);
+}
+
+/**
+ * Vector Complex Unpack from Split format to Interleaved format.
+ *   out.real = [in.real[0], in.imag[0], in.real[1], in.imag[1]]
+ *   out.imag = [in.real[2], in.imag[2], in.real[3], in.imag[3]]
+ *
+ * Both halves are computed before either is stored, so out may be the
+ * same object as in.
+ */
+static inline void VC_UNPACK(VC *out, VC *in) {
+  const __m128 low_pairs = _mm_unpacklo_ps(in->real, in->imag);
+  const __m128 high_pairs = _mm_unpackhi_ps(in->real, in->imag);
+
+  out->real = low_pairs;
+  out->imag = high_pairs;
+}
+
+/**
+ * Load 4 complex elements from an interleaved array (8 floats) and
+ * de-interleave them:
+ *   out.real = [in[0], in[2], in[4], in[6]]
+ *   out.imag = [in[1], in[3], in[5], in[7]]
+ * Uses aligned loads (_mm_load_ps), so in must be 16 byte aligned.
+ */
+static inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
+  /* Elements 0 and 1, then elements 2 and 3, each as (real, imag) pairs. */
+  const __m128 first_two = _mm_load_ps(in);
+  const __m128 last_two = _mm_load_ps(in + 4);
+
+  /* Even lanes carry the real parts, odd lanes the imaginary parts. */
+  out->real = _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(2, 0, 2, 0));
+  out->imag = _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(3, 1, 3, 1));
+}
+/**
+ * Load a complex vector from split format through unaligned loads
+ * (_mm_loadu_ps), for input addresses that are not 16 byte aligned:
+ *   out.real = in[0..3]
+ *   out.imag = in[offset..offset + 3]
+ */
+static inline void VC_LOADU_SPLIT(
+    VC *out,
+    const OMX_F32 *in,
+    OMX_INT offset) {
+  const OMX_F32 *const real_src = in;
+  const OMX_F32 *const imag_src = in + offset;
+
+  out->real = _mm_loadu_ps(real_src);
+  out->imag = _mm_loadu_ps(imag_src);
+}
+
+/**
+ * Reverse, in place, the order of the 4 complex elements held in v:
+ * lane k of real and imag moves to lane 3 - k.
+ */
+static inline void VC_REVERSE(VC *v) {
+  __m128 lanes;
+
+  lanes = v->real;
+  v->real = _mm_shuffle_ps(lanes, lanes, _MM_SHUFFLE(0, 1, 2, 3));
+  lanes = v->imag;
+  v->imag = _mm_shuffle_ps(lanes, lanes, _MM_SHUFFLE(0, 1, 2, 3));
+}
+/**
+ * Store 4 complex elements to an interleaved complex array:
+ *   out[2 * k]     = in.real[k]
+ *   out[2 * k + 1] = in.imag[k]      for k = 0..3
+ * Uses aligned stores (_mm_store_ps), so out must be 16 byte aligned.
+ */
+static inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
+  __m128 pairs;
+
+  /* Elements 0 and 1. */
+  pairs = _mm_unpacklo_ps(in->real, in->imag);
+  _mm_store_ps(out, pairs);
+
+  /* Elements 2 and 3. */
+  pairs = _mm_unpackhi_ps(in->real, in->imag);
+  _mm_store_ps(out + 4, pairs);
+}
+
+/**
+ * Store 4 complex elements to an interleaved complex array through
+ * unaligned stores (_mm_storeu_ps), for addresses that are not 16 byte
+ * aligned:
+ *   out[2 * k]     = in.real[k]
+ *   out[2 * k + 1] = in.imag[k]      for k = 0..3
+ */
+static inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
+  __m128 pairs;
+
+  /* Elements 0 and 1. */
+  pairs = _mm_unpacklo_ps(in->real, in->imag);
+  _mm_storeu_ps(out, pairs);
+
+  /* Elements 2 and 3. */
+  pairs = _mm_unpackhi_ps(in->real, in->imag);
+  _mm_storeu_ps(out + 4, pairs);
+}
+
+/**
+ * Same arithmetic as VC_ADD_X, stored in split format:
+ *   out[0..3]                   = a.real + b.imag
+ *   out[offset..offset + 3]     = b.real + a.imag
+ * Uses aligned stores (_mm_store_ps), so out and out + offset must both
+ * be 16 byte aligned.
+ */
+static inline void VC_ADD_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a, VC *b,
+    OMX_INT offset) {
+  __m128 sum;
+
+  sum = _mm_add_ps(a->real, b->imag);
+  _mm_store_ps(out, sum);
+  sum = _mm_add_ps(b->real, a->imag);
+  _mm_store_ps(out + offset, sum);
+}
+
+/**
+ * Same arithmetic as VC_SUB_X, with the 4 lanes of each component
+ * reversed before being stored in split format:
+ *   out[0..3]                   = reverse(a.real - b.imag)
+ *   out[offset..offset + 3]     = reverse(b.real - a.imag)
+ * Uses unaligned stores (_mm_storeu_ps), so the addresses need not be
+ * 16 byte aligned.
+ */
+static inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  const __m128 re_diff = _mm_sub_ps(a->real, b->imag);
+  _mm_storeu_ps(out,
+                _mm_shuffle_ps(re_diff, re_diff, _MM_SHUFFLE(0, 1, 2, 3)));
+
+  const __m128 im_diff = _mm_sub_ps(b->real, a->imag);
+  _mm_storeu_ps(out + offset,
+                _mm_shuffle_ps(im_diff, im_diff, _MM_SHUFFLE(0, 1, 2, 3)));
+}
+
+/**
+ * Load 4 complex elements from an interleaved array (8 floats) and
+ * return them de-interleaved in two separate registers:
+ *   *out0 = [in[0], in[2], in[4], in[6]]   (real parts)
+ *   *out1 = [in[1], in[3], in[5], in[7]]   (imaginary parts)
+ * Uses aligned loads (_mm_load_ps), so in must be 16 byte aligned.
+ */
+static inline void VC_LOAD_SHUFFLE(
+    __m128 *out0,
+    __m128 *out1,
+    const OMX_F32 *in) {
+  const __m128 first_two = _mm_load_ps(in);
+  const __m128 last_two = _mm_load_ps(in + 4);
+  const __m128 real_parts =
+      _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(2, 0, 2, 0));
+  const __m128 imag_parts =
+      _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(3, 1, 3, 1));
+
+  *out0 = real_parts;
+  *out1 = imag_parts;
+}
+
+/**
+ * Finish the forward radix4 butterfly and store the 4 outputs in split
+ * format, with n as the distance between real and imaginary parts:
+ *   out0 = t0 + t2
+ *   out2 = t0 - t2
+ *   out1 = (t1.real + t3.imag, t1.imag - t3.real)
+ *   out3 = (t1.real - t3.imag, t1.imag + t3.real)
+ * The stores are issued in the order out0, out2, out1, out3.
+ */
+static inline void RADIX4_FWD_BUTTERFLY_STORE(
+    OMX_F32 *out0,
+    OMX_F32 *out1,
+    OMX_F32 *out2,
+    OMX_F32 *out3,
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    OMX_INT n) {
+  /* Sum and difference of the (t0, t2) pair. */
+  VC_ADD_STORE_SPLIT(out0, t0, t2, n);
+  VC_SUB_STORE_SPLIT(out2, t0, t2, n);
+
+  /* Cross-combined (t1, t3) pair. */
+  VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);
+  VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
+}
+
+/**
+ * Finish the inverse radix4 butterfly and store the 4 outputs in split
+ * format, with n as the distance between real and imaginary parts:
+ *   out0 = t0 + t2
+ *   out2 = t0 - t2
+ *   out1 = (t1.real - t3.imag, t1.imag + t3.real)
+ *   out3 = (t1.real + t3.imag, t1.imag - t3.real)
+ * Compared with the forward version, the out1 and out3 formulas are
+ * exchanged. The stores are issued in the order out0, out2, out1, out3.
+ */
+static inline void RADIX4_INV_BUTTERFLY_STORE(
+    OMX_F32 *out0,
+    OMX_F32 *out1,
+    OMX_F32 *out2,
+    OMX_F32 *out3,
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    OMX_INT n) {
+  /* Sum and difference of the (t0, t2) pair. */
+  VC_ADD_STORE_SPLIT(out0, t0, t2, n);
+  VC_SUB_STORE_SPLIT(out2, t0, t2, n);
+
+  /* Cross-combined (t1, t3) pair. */
+  VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);
+  VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
+}
+
+/**
+ * Radix4 forward butterfly. Inputs T1..T3 are first multiplied by the
+ * twiddles Tw1..Tw3, then combined:
+ *   t0 = T0 + Tw2 * T2
+ *   t1 = T0 - Tw2 * T2
+ *   t2 = Tw1 * T1 + Tw3 * T3
+ *   t3 = Tw1 * T1 - Tw3 * T3
+ * t0 is written before T0 is read again for t1, so t0 must not be the
+ * same object as T0.
+ */
+static inline void RADIX4_FWD_BUTTERFLY(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *Tw1,
+    VC *Tw2,
+    VC *Tw3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  VC twiddled1;
+  VC twiddled2;
+  VC twiddled3;
+
+  /* Apply the twiddle factors; all products are formed before any
+   * output is written. */
+  VC_MUL(&twiddled1, Tw1, T1);
+  VC_MUL(&twiddled2, Tw2, T2);
+  VC_MUL(&twiddled3, Tw3, T3);
+
+  /* Even pair: T0 with the twiddled T2. */
+  VC_ADD(t0, T0, &twiddled2);
+  VC_SUB(t1, T0, &twiddled2);
+
+  /* Odd pair: twiddled T1 with twiddled T3. */
+  VC_ADD(t2, &twiddled1, &twiddled3);
+  VC_SUB(t3, &twiddled1, &twiddled3);
+}
+
+/**
+ * Radix4 inverse butterfly. Inputs T1..T3 are first multiplied by the
+ * conjugated twiddles (VC_CONJ_MUL), then combined:
+ *   t0 = T0 + conj(Tw2) * T2
+ *   t1 = T0 - conj(Tw2) * T2
+ *   t2 = conj(Tw1) * T1 + conj(Tw3) * T3
+ *   t3 = conj(Tw1) * T1 - conj(Tw3) * T3
+ * t0 is written before T0 is read again for t1, so t0 must not be the
+ * same object as T0.
+ */
+static inline void RADIX4_INV_BUTTERFLY(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *Tw1,
+    VC *Tw2,
+    VC *Tw3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  VC twiddled1;
+  VC twiddled2;
+  VC twiddled3;
+
+  /* Apply the conjugated twiddle factors; all products are formed
+   * before any output is written. */
+  VC_CONJ_MUL(&twiddled1, Tw1, T1);
+  VC_CONJ_MUL(&twiddled2, Tw2, T2);
+  VC_CONJ_MUL(&twiddled3, Tw3, T3);
+
+  /* Even pair: T0 with the twiddled T2. */
+  VC_ADD(t0, T0, &twiddled2);
+  VC_SUB(t1, T0, &twiddled2);
+
+  /* Odd pair: twiddled T1 with twiddled T3. */
+  VC_ADD(t2, &twiddled1, &twiddled3);
+  VC_SUB(t3, &twiddled1, &twiddled3);
+}
+
+/**
+ * Radix4 butterfly of the first stage, shared by the forward and the
+ * inverse transform. No twiddle factors are applied:
+ *   t0 = T0 + T2
+ *   t1 = T0 - T2
+ *   t2 = T1 + T3
+ *   t3 = T1 - T3
+ * Each sum is written before its inputs are read again for the matching
+ * difference, so t0 must not be the same object as T0 or T2, and t2 must
+ * not be the same object as T1 or T3.
+ */
+static inline void RADIX4_BUTTERFLY_FS(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  /* Even pair (T0, T2). */
+  VC_ADD(t0, T0, T2);
+  VC_SUB(t1, T0, T2);
+
+  /* Odd pair (T1, T3). */
+  VC_ADD(t2, T1, T3);
+  VC_SUB(t3, T1, T3);
+}
+
+/**
+ * Load 16 float elements (4 sse registers) which form a 4 * 4 matrix,
+ * then transpose the matrix:
+ *    3,  2,  1,  0              12,  8,  4,  0
+ *    7,  6,  5,  4    =====>    13,  9,  5,  1
+ *   11, 10,  9,  8              14, 10,  6,  2
+ *   15, 14, 13, 12              15, 11,  7,  3
+ *
+ * This is done twice: the rows at pT0..pT3 produce the real parts and
+ * the rows at pT0 + n..pT3 + n produce the imaginary parts, so that
+ *   Tk.real = [pT0[k],     pT1[k],     pT2[k],     pT3[k]]
+ *   Tk.imag = [pT0[n + k], pT1[n + k], pT2[n + k], pT3[n + k]]
+ * for k = 0..3. Uses aligned loads (_mm_load_ps), so all eight row
+ * addresses must be 16 byte aligned.
+ */
+static inline void VC_LOAD_MATRIX_TRANSPOSE(
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3,
+    const OMX_F32 *pT0,
+    const OMX_F32 *pT1,
+    const OMX_F32 *pT2,
+    const OMX_F32 *pT3,
+    OMX_INT n) {
+  __m128 row0, row1, row2, row3;
+  __m128 lo01, hi01, lo23, hi23;
+
+  /* Real parts: load the four rows. */
+  row0 = _mm_load_ps(pT0);
+  row1 = _mm_load_ps(pT1);
+  row2 = _mm_load_ps(pT2);
+  row3 = _mm_load_ps(pT3);
+
+  /* Interleave row pairs, then gather matching columns. */
+  lo01 = _mm_unpacklo_ps(row0, row1);
+  hi01 = _mm_unpackhi_ps(row0, row1);
+  lo23 = _mm_unpacklo_ps(row2, row3);
+  hi23 = _mm_unpackhi_ps(row2, row3);
+  T0->real = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
+  T1->real = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
+  T2->real = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
+  T3->real = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
+
+  /* Imaginary parts: same procedure on the rows n floats further on. */
+  row0 = _mm_load_ps(pT0 + n);
+  row1 = _mm_load_ps(pT1 + n);
+  row2 = _mm_load_ps(pT2 + n);
+  row3 = _mm_load_ps(pT3 + n);
+
+  lo01 = _mm_unpacklo_ps(row0, row1);
+  hi01 = _mm_unpackhi_ps(row0, row1);
+  lo23 = _mm_unpacklo_ps(row2, row3);
+  hi23 = _mm_unpackhi_ps(row2, row3);
+  T0->imag = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
+  T1->imag = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
+  T2->imag = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
+  T3->imag = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
+}