Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

527 lines
24 KiB

  1. /*-========================================================================-_
  2. | - XDSP - |
  3. | Copyright (c) Microsoft Corporation. All rights reserved. |
  4. |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
  5. |PROJECT: XDSP MODEL: Unmanaged User-mode |
  6. |VERSION: 1.0 EXCEPT: No Exceptions |
  7. |CLASS: N / A MINREQ: WinXP, Xbox360 |
  8. |BASE: N / A DIALECT: MSC++ 14.00 |
  9. |>------------------------------------------------------------------------<|
  10. | DUTY: DSP functions with CPU extension specific optimizations |
  11. ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
  12. NOTES:
  13. 1. Definition of terms:
  14. DSP: Digital Signal Processing.
  15. FFT: Fast Fourier Transform.
  16. 2. All buffer parameters must be 16-byte aligned.
  17. 3. All FFT functions support only FLOAT32 mono audio. */
  18. #pragma once
  19. //--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------//
  20. #include <windef.h> // general windows types
  21. #include <math.h> // trigonometric functions
  22. #if defined(_XBOX) // SIMD intrinsics
  23. #include <ppcintrinsics.h>
  24. #else
  25. #include <emmintrin.h>
  26. #endif
  27. //--------------<M-A-C-R-O-S>-----------------------------------------------//
  // assertion
  // When DBG is nonzero, a failed assertion writes the expression text and the
  // enclosing function name to the debugger output and then breaks into the
  // debugger.  Otherwise the expression is only handed to the optimizer through
  // __assume and no runtime check is performed.
  // The checking form is a single void expression (not a bare if-block) so that
  // "DSPASSERT(x);" is exactly one statement and cannot capture a following
  // "else" when used inside an unbraced if/else.
  #if !defined(DSPASSERT)
  #if DBG
  #define DSPASSERT(exp) ((void)((exp) || (OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"), __debugbreak(), 0)))
  #else
  #define DSPASSERT(exp) __assume(exp)
  #endif
  #endif
  // Evaluates to true when n is nonzero and has exactly one bit set,
  // i.e. when n is a power of 2.  The argument is expanded more than once,
  // so it must not have side effects.
  #if !defined(ISPOWEROF2)
  #define ISPOWEROF2(n) ( (n) != 0 && ((n) & ((n)-1)) == 0 )
  #endif
  40. //--------------<H-E-L-P-E-R-S>---------------------------------------------//
  41. namespace XDSP {
  42. #pragma warning(push)
  43. #pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings
  44. // Helper functions, used by the FFT functions.
  45. // The application need not call them directly.
  46. // primitive types
  47. typedef __m128 XVECTOR;
  48. typedef XVECTOR& XVECTORREF;
  49. // Parallel multiplication of four complex numbers, assuming
  50. // real and imaginary values are stored in separate vectors.
  51. __forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREF r1, __in XVECTORREF i1, __in XVECTORREF r2, __in XVECTORREF i2)
  52. {
  53. // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
  54. XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
  55. XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
  56. XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
  57. XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
  58. rResult = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
  59. iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
  60. }
  61. __forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREF r2, __in XVECTORREF i2)
  62. {
  63. // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
  64. XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
  65. XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
  66. XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
  67. XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
  68. r1 = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2)
  69. i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
  70. }
  71. // Radix-4 decimation-in-time FFT butterfly.
  72. // This version assumes that all four elements of the butterfly are
  73. // adjacent in a single vector.
  74. //
  75. // Compute the product of the complex input vector and the
  76. // 4-element DFT matrix:
  77. // | 1 1 1 1 | | (r1X,i1X) |
  78. // | 1 -j -1 j | | (r1Y,i1Y) |
  79. // | 1 -1 1 -1 | | (r1Z,i1Z) |
  80. // | 1 j -1 -j | | (r1W,i1W) |
  81. //
  82. // This matrix can be decomposed into two simpler ones to reduce the
  83. // number of additions needed. The decomposed matrices look like this:
  84. // | 1 0 1 0 | | 1 0 1 0 |
  85. // | 0 1 0 -j | | 1 0 -1 0 |
  86. // | 1 0 -1 0 | | 0 1 0 1 |
  87. // | 0 1 0 j | | 0 1 0 -1 |
  88. //
  89. // Combine as follows:
  90. // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) |
  91. // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) |
  92. // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) |
  93. // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) |
  94. //
  95. // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) |
  96. // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) |
  97. // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) |
  98. // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) |
  99. __forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1)
  100. {
  101. // sign constants for radix-4 butterflies
  102. const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f, 0.0f, -0.0f };
  103. const static XVECTOR vDFT4SignBits2 = { 0.0f, 0.0f, -0.0f, -0.0f };
  104. const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f, 0.0f };
  105. // calculating Temp
  106. XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)), // [r1X| r1X|r1Y| r1Y] +
  107. _mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z|-r1Z|r1W|-r1W]
  108. XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)), // [i1X| i1X|i1Y| i1Y] +
  109. _mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z|-i1Z|i1W|-i1W]
  110. // calculating Result
  111. XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2)); // [rTempZ|rTempW|iTempZ|iTempW]
  112. XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ|iTempW|rTempZ|iTempW]
  113. XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ|iTempW|rTempZ|iTempW]
  114. r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX| rTempY| rTempX| rTempY] +
  115. _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) ); // [rTempZ| iTempW|-rTempZ|-iTempW]
  116. i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX| iTempY| iTempX| iTempY] +
  117. _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) ); // [iTempZ|-rTempW|-iTempZ| rTempW]
  118. }
  119. // Radix-4 decimation-in-time FFT butterfly.
  120. // This version assumes that elements of the butterfly are
  121. // in different vectors, so that each vector in the input
  122. // contains elements from four different butterflies.
  123. // The four separate butterflies are processed in parallel.
  124. //
  125. // The calculations here are the same as the ones in the single-vector
  126. // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
  127. // they are done in parallel on sixteen independent complex values.
  128. // There is no interdependence between the vector elements:
  129. // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) |
  130. // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) |
  131. // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) |
  132. // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) |
  133. //
  134. // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) |
  135. // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) |
  136. // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) |
  137. // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) |
  138. __forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0,
  139. __inout XVECTORREF r1,
  140. __inout XVECTORREF r2,
  141. __inout XVECTORREF r3,
  142. __inout XVECTORREF i0,
  143. __inout XVECTORREF i1,
  144. __inout XVECTORREF i2,
  145. __inout XVECTORREF i3,
  146. __in_ecount(uStride*4) XVECTOR* __restrict pUnityTableReal,
  147. __in_ecount(uStride*4) XVECTOR* __restrict pUnityTableImaginary,
  148. const UINT32 uStride, const BOOL fLast)
  149. {
  150. DSPASSERT(pUnityTableReal != NULL);
  151. DSPASSERT(pUnityTableImaginary != NULL);
  152. DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0);
  153. DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0);
  154. DSPASSERT(ISPOWEROF2(uStride));
  155. XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7;
  156. XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7;
  157. // calculating Temp
  158. rTemp0 = _mm_add_ps(r0, r2); iTemp0 = _mm_add_ps(i0, i2);
  159. rTemp2 = _mm_add_ps(r1, r3); iTemp2 = _mm_add_ps(i1, i3);
  160. rTemp1 = _mm_sub_ps(r0, r2); iTemp1 = _mm_sub_ps(i0, i2);
  161. rTemp3 = _mm_sub_ps(r1, r3); iTemp3 = _mm_sub_ps(i1, i3);
  162. rTemp4 = _mm_add_ps(rTemp0, rTemp2); iTemp4 = _mm_add_ps(iTemp0, iTemp2);
  163. rTemp5 = _mm_add_ps(rTemp1, iTemp3); iTemp5 = _mm_sub_ps(iTemp1, rTemp3);
  164. rTemp6 = _mm_sub_ps(rTemp0, rTemp2); iTemp6 = _mm_sub_ps(iTemp0, iTemp2);
  165. rTemp7 = _mm_sub_ps(rTemp1, iTemp3); iTemp7 = _mm_add_ps(iTemp1, rTemp3);
  166. // calculating Result
  167. // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
  168. vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]);
  169. vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
  170. vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
  171. if (fLast) {
  172. ButterflyDIT4_1(rTemp4, iTemp4);
  173. ButterflyDIT4_1(rTemp5, iTemp5);
  174. ButterflyDIT4_1(rTemp6, iTemp6);
  175. ButterflyDIT4_1(rTemp7, iTemp7);
  176. }
  177. r0 = rTemp4; i0 = iTemp4;
  178. r1 = rTemp5; i1 = iTemp5;
  179. r2 = rTemp6; i2 = iTemp6;
  180. r3 = rTemp7; i3 = iTemp7;
  181. }
  182. //--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------//
  183. ////
  184. // DESCRIPTION:
  185. // 4-sample FFT.
  186. //
  187. // PARAMETERS:
  188. // pReal - [inout] real components, must have at least uCount elements
  189. // pImaginary - [inout] imaginary components, must have at least uCount elements
  190. // uCount - [in] number of FFT iterations
  191. //
  192. // RETURN VALUE:
  193. // void
  194. ////
  195. __forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  196. {
  197. DSPASSERT(pReal != NULL);
  198. DSPASSERT(pImaginary != NULL);
  199. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  200. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  201. DSPASSERT(ISPOWEROF2(uCount));
  202. for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  203. ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
  204. }
  205. }
  206. ////
  207. // DESCRIPTION:
  208. // 8-sample FFT.
  209. //
  210. // PARAMETERS:
  211. // pReal - [inout] real components, must have at least uCount*2 elements
  212. // pImaginary - [inout] imaginary components, must have at least uCount*2 elements
  213. // uCount - [in] number of FFT iterations
  214. //
  215. // RETURN VALUE:
  216. // void
  217. ////
  218. __forceinline void FFT8 (__inout_ecount(uCount*2) XVECTOR* __restrict pReal, __inout_ecount(uCount*2) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  219. {
  220. DSPASSERT(pReal != NULL);
  221. DSPASSERT(pImaginary != NULL);
  222. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  223. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  224. DSPASSERT(ISPOWEROF2(uCount));
  225. static XVECTOR wr1 = { 1.0f, 0.707168f, 0.0f, -0.707168f };
  226. static XVECTOR wi1 = { 0.0f, -0.707168f, -1.0f, -0.707168f };
  227. static XVECTOR wr2 = { -1.0f, -0.707168f, 0.0f, 0.707168f };
  228. static XVECTOR wi2 = { 0.0f, 0.707168f, 1.0f, 0.707168f };
  229. for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  230. XVECTOR* __restrict pR = pReal + uIndex*2;
  231. XVECTOR* __restrict pI = pImaginary + uIndex*2;
  232. XVECTOR oddsR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1));
  233. XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0));
  234. XVECTOR oddsI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1));
  235. XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0));
  236. ButterflyDIT4_1(oddsR, oddsI);
  237. ButterflyDIT4_1(evensR, evensI);
  238. XVECTOR r, i;
  239. vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
  240. pR[0] = _mm_add_ps(evensR, r);
  241. pI[0] = _mm_add_ps(evensI, i);
  242. vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
  243. pR[1] = _mm_add_ps(evensR, r);
  244. pI[1] = _mm_add_ps(evensI, i);
  245. }
  246. }
  247. ////
  248. // DESCRIPTION:
  249. // 16-sample FFT.
  250. //
  251. // PARAMETERS:
  252. // pReal - [inout] real components, must have at least uCount*4 elements
  253. // pImaginary - [inout] imaginary components, must have at least uCount*4 elements
  254. // uCount - [in] number of FFT iterations
  255. //
  256. // RETURN VALUE:
  257. // void
  258. ////
  259. __forceinline void FFT16 (__inout_ecount(uCount*4) XVECTOR* __restrict pReal, __inout_ecount(uCount*4) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  260. {
  261. DSPASSERT(pReal != NULL);
  262. DSPASSERT(pImaginary != NULL);
  263. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  264. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  265. DSPASSERT(ISPOWEROF2(uCount));
  266. XVECTOR aUnityTableReal[4] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f };
  267. XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f };
  268. for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  269. ButterflyDIT4_4(pReal[uIndex*4],
  270. pReal[uIndex*4 + 1],
  271. pReal[uIndex*4 + 2],
  272. pReal[uIndex*4 + 3],
  273. pImaginary[uIndex*4],
  274. pImaginary[uIndex*4 + 1],
  275. pImaginary[uIndex*4 + 2],
  276. pImaginary[uIndex*4 + 3],
  277. aUnityTableReal,
  278. aUnityTableImaginary,
  279. 1, TRUE);
  280. }
  281. }
  282. ////
  283. // DESCRIPTION:
  284. // 2^N-sample FFT.
  285. //
  286. // REMARKS:
  287. // For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4().
  288. //
  289. // PARAMETERS:
  290. // pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
  291. // pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
  292. // pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
  293. // uLength - [in] FFT length in samples, must be a power of 2 > 16
  294. // uCount - [in] number of FFT iterations
  295. //
  296. // RETURN VALUE:
  297. // void
  298. ////
  299. inline void FFT (__inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(uLength*uCount) XVECTOR* __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
  300. {
  301. DSPASSERT(pReal != NULL);
  302. DSPASSERT(pImaginary != NULL);
  303. DSPASSERT(pUnityTable != NULL);
  304. DSPASSERT((UINT_PTR)pReal % 16 == 0);
  305. DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  306. DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
  307. DSPASSERT(uLength > 16);
  308. DSPASSERT(ISPOWEROF2(uLength));
  309. DSPASSERT(ISPOWEROF2(uCount));
  310. XVECTOR* __restrict pUnityTableReal = pUnityTable;
  311. XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
  312. const UINT32 uTotal = uCount * uLength;
  313. const UINT32 uTotal_vectors = uTotal >> 2;
  314. const UINT32 uStage_vectors = uLength >> 2;
  315. const UINT32 uStride = uStage_vectors >> 2; // stride between butterfly elements
  316. const UINT32 uSkip = uStage_vectors - uStride;
  317. for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) {
  318. UINT32 n = (uIndex/uStride) * (uStride + uSkip) + (uIndex % uStride);
  319. ButterflyDIT4_4(pReal[n],
  320. pReal[n + uStride],
  321. pReal[n + uStride*2],
  322. pReal[n + uStride*3],
  323. pImaginary[n ],
  324. pImaginary[n + uStride],
  325. pImaginary[n + uStride*2],
  326. pImaginary[n + uStride*3],
  327. pUnityTableReal + n % uStage_vectors,
  328. pUnityTableImaginary + n % uStage_vectors,
  329. uStride, FALSE);
  330. }
  331. if (uLength > 16*4) {
  332. FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
  333. } else if (uLength == 16*4) {
  334. FFT16(pReal, pImaginary, uCount*4);
  335. } else if (uLength == 8*4) {
  336. FFT8(pReal, pImaginary, uCount*4);
  337. } else if (uLength == 4*4) {
  338. FFT4(pReal, pImaginary, uCount*4);
  339. }
  340. }
  341. //--------------------------------------------------------------------------//
  342. ////
  343. // DESCRIPTION:
  344. // Initializes unity roots lookup table used by FFT functions.
  345. // Once initialized, the table need not be initialized again unless a
  346. // different FFT length is desired.
  347. //
  348. // REMARKS:
  349. // The unity tables of FFT length 16 and below are hard coded into the
  350. // respective FFT functions and so need not be initialized.
  351. //
  352. // PARAMETERS:
  353. // pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength XVECTORs
  354. // uLength - [in] FFT length in samples, must be a power of 2 > 16
  355. //
  356. // RETURN VALUE:
  357. // void
  358. ////
  359. inline void FFTInitializeUnityTable (__out_bcount(uLength*sizeof(XVECTOR)) FLOAT32* __restrict pUnityTable, UINT32 uLength)
  360. {
  361. DSPASSERT(pUnityTable != NULL);
  362. DSPASSERT(uLength > 16);
  363. DSPASSERT(ISPOWEROF2(uLength));
  364. // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16
  365. do {
  366. FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length
  367. uLength >>= 2;
  368. // pUnityTable[0 to uLength*4-1] contains real components for current FFT length
  369. // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length
  370. for (UINT32 i=0; i<4; ++i) {
  371. for (UINT32 j=0; j<uLength; ++j) {
  372. UINT32 uIndex = (i*uLength) + j;
  373. pUnityTable[uIndex] = cosf(FLOAT32(i)*FLOAT32(j)*flStep); // real component
  374. pUnityTable[uIndex + uLength*4] = -sinf(FLOAT32(i)*FLOAT32(j)*flStep); // imaginary component
  375. }
  376. }
  377. pUnityTable += uLength*8;
  378. } while (uLength > 16);
  379. }
  380. ////
  381. // DESCRIPTION:
  382. // The FFT functions generate output in bit reversed order.
  383. // Use this function to re-arrange them into order of increasing frequency.
  384. //
  385. // PARAMETERS:
  386. // pOutput - [out] output buffer, receives samples in order of increasing frequency, must have at least (1<<uLog2Length) elements
  387. // pInput - [in] input buffer, samples in bit reversed order as generated by FFT functions, must have at least (1<<uLog2Length) elements
  388. // uLog2Length - [in] LOG (base 2) of FFT length in samples, must be > 0
  389. //
  390. // RETURN VALUE:
  391. // void
  392. ////
  393. inline void FFTUnswizzle (__out_ecount(1<<uLog2Length) FLOAT32* __restrict pOutput, __in_ecount(1<<uLog2Length) const FLOAT32* __restrict pInput, UINT32 uLog2Length)
  394. {
  395. DSPASSERT(pOutput != NULL);
  396. DSPASSERT(pInput != NULL);
  397. DSPASSERT(uLog2Length > 0);
  398. UINT32 uLength = UINT32(1 << uLog2Length);
  399. if ((uLog2Length & 0x1) == 0) {
  400. // even powers of two
  401. for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
  402. UINT32 n = uIndex;
  403. n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 );
  404. n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 );
  405. n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 );
  406. n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
  407. n >>= (32 - uLog2Length);
  408. pOutput[n] = pInput[uIndex];
  409. }
  410. } else {
  411. // odd powers of two
  412. for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
  413. UINT32 n = (uIndex>>3);
  414. n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 );
  415. n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 );
  416. n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 );
  417. n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
  418. n >>= (32 - (uLog2Length-3));
  419. n |= ((uIndex & 0x7) << (uLog2Length - 3));
  420. pOutput[n] = pInput[uIndex];
  421. }
  422. }
  423. }
  424. ////
  425. // DESCRIPTION:
  426. // Convert complex components to polar form.
  427. //
  428. // PARAMETERS:
  429. // pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
  430. // pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
  431. // pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
  432. // uLength - [in] FFT length in samples, must be a power of 2 >= 4
  433. //
  434. // RETURN VALUE:
  435. // void
  436. ////
  437. inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, UINT32 uLength)
  438. {
  439. DSPASSERT(pOutput != NULL);
  440. DSPASSERT(pInputReal != NULL);
  441. DSPASSERT(pInputImaginary != NULL);
  442. DSPASSERT(uLength >= 4);
  443. DSPASSERT(ISPOWEROF2(uLength));
  444. FLOAT32 flOneOverLength = 1.0f / uLength;
  445. // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
  446. XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength);
  447. for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) {
  448. XVECTOR vReal = _mm_mul_ps(pInputReal[uIndex], vOneOverLength);
  449. XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength);
  450. XVECTOR vRR = _mm_mul_ps(vReal, vReal);
  451. XVECTOR vII = _mm_mul_ps(vImaginary, vImaginary);
  452. XVECTOR vRRplusII = _mm_add_ps(vRR, vII);
  453. XVECTOR vTotal = _mm_sqrt_ps(vRRplusII);
  454. pOutput[uIndex] = _mm_add_ps(vTotal, vTotal);
  455. }
  456. }
  457. #pragma warning(pop)
  458. }; // namespace XDSP
  459. //---------------------------------<-EOF->----------------------------------//