Counter Strike : Global Offensive Source Code


  1. /*++
  2. Copyright (c) Microsoft Corporation. All rights reserved.
  3. Module Name:
  4. xnamathconvert.inl
  5. Abstract:
  6. XNA math library for Windows and Xbox 360: Conversion, loading, and storing functions.
  7. --*/
  8. #if defined(_MSC_VER) && (_MSC_VER > 1000)
  9. #pragma once
  10. #endif
  11. #ifndef __XNAMATHCONVERT_INL__
  12. #define __XNAMATHCONVERT_INL__
  13. #define XM_PACK_FACTOR (FLOAT)(1 << 22)
  14. #define XM_UNPACK_FACTOR_UNSIGNED (FLOAT)(1 << 23)
  15. #define XM_UNPACK_FACTOR_SIGNED XM_PACK_FACTOR
  16. #define XM_UNPACK_UNSIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
  17. {-XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
  18. -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
  19. -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
  20. -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}
  21. #define XM_UNPACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
  22. {XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
  23. XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
  24. XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
  25. XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}
  26. #define XM_UNPACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
  27. {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1), \
  28. -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1), \
  29. -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1), \
  30. -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1)}
  31. //#define XM_UNPACK_SIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
  32. // {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1) * 3.0f, \
  33. // -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1) * 3.0f, \
  34. // -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1) * 3.0f, \
  35. // -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1) * 3.0f}
  36. #define XM_PACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
  37. {-(FLOAT)((1 << (BitsX)) - 1) / XM_PACK_FACTOR, \
  38. -(FLOAT)((1 << (BitsY)) - 1) / XM_PACK_FACTOR, \
  39. -(FLOAT)((1 << (BitsZ)) - 1) / XM_PACK_FACTOR, \
  40. -(FLOAT)((1 << (BitsW)) - 1) / XM_PACK_FACTOR}
  41. #define XM_PACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
  42. {-(FLOAT)((1 << ((BitsX) - 1)) - 1) / XM_PACK_FACTOR, \
  43. -(FLOAT)((1 << ((BitsY) - 1)) - 1) / XM_PACK_FACTOR, \
  44. -(FLOAT)((1 << ((BitsZ) - 1)) - 1) / XM_PACK_FACTOR, \
  45. -(FLOAT)((1 << ((BitsW) - 1)) - 1) / XM_PACK_FACTOR}
  46. #define XM_PACK_OFFSET XMVectorSplatConstant(3, 0)
  47. //#define XM_UNPACK_OFFSET XM_PACK_OFFSET
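// Annotation (not part of the original source): a worked expansion of one of the
// macros above. With XM_UNPACK_FACTOR_SIGNED == (FLOAT)(1 << 22),
// XM_UNPACK_SIGNEDN_SCALE(16, 16, 16, 16) expands to
//   { -4194304.0f/32767.0f, -4194304.0f/32767.0f,
//     -4194304.0f/32767.0f, -4194304.0f/32767.0f }
// i.e. a per-component scale built from the bit count of each component; the
// matching XM_PACK_* macros produce the reciprocal scales used when storing.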
  48. /****************************************************************************
  49. *
  50. * Data conversion
  51. *
  52. ****************************************************************************/
  53. //------------------------------------------------------------------------------
  54. XMFINLINE FLOAT XMConvertHalfToFloat
  55. (
  56. HALF Value
  57. )
  58. {
  59. #if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
  60. UINT Mantissa;
  61. UINT Exponent;
  62. UINT Result;
  63. Mantissa = (UINT)(Value & 0x03FF);
  64. if ((Value & 0x7C00) != 0) // The value is normalized
  65. {
  66. Exponent = (UINT)((Value >> 10) & 0x1F);
  67. }
  68. else if (Mantissa != 0) // The value is denormalized
  69. {
  70. // Normalize the value in the resulting float
  71. Exponent = 1;
  72. do
  73. {
  74. Exponent--;
  75. Mantissa <<= 1;
  76. } while ((Mantissa & 0x0400) == 0);
  77. Mantissa &= 0x03FF;
  78. }
  79. else // The value is zero
  80. {
  81. Exponent = (UINT)-112;
  82. }
  83. Result = ((Value & 0x8000) << 16) | // Sign
  84. ((Exponent + 112) << 23) | // Exponent
  85. (Mantissa << 13); // Mantissa
  86. return *(FLOAT*)&Result;
  87. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  88. #endif
  89. }
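// Annotation (not part of the original source): a worked example of the
// conversion above. For the half value 0x3C00 (sign 0, exponent 0x0F, mantissa 0):
//   Exponent = 0x0F = 15
//   Result   = (0 << 16) | ((15 + 112) << 23) | (0 << 13) = 0x3F800000
// which is the bit pattern of 1.0f. The +112 rebias is (127 - 15), the difference
// between the float and half exponent biases.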
  90. //------------------------------------------------------------------------------
  91. XMINLINE FLOAT* XMConvertHalfToFloatStream
  92. (
  93. FLOAT* pOutputStream,
  94. UINT OutputStride,
  95. CONST HALF* pInputStream,
  96. UINT InputStride,
  97. UINT HalfCount
  98. )
  99. {
  100. #if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
  101. UINT i;
  102. BYTE* pHalf = (BYTE*)pInputStream;
  103. BYTE* pFloat = (BYTE*)pOutputStream;
  104. XMASSERT(pOutputStream);
  105. XMASSERT(pInputStream);
  106. for (i = 0; i < HalfCount; i++)
  107. {
  108. *(FLOAT*)pFloat = XMConvertHalfToFloat(*(HALF*)pHalf);
  109. pHalf += InputStride;
  110. pFloat += OutputStride;
  111. }
  112. return pOutputStream;
  113. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  114. #endif // _XM_VMX128_INTRINSICS_
  115. }
  116. //------------------------------------------------------------------------------
  117. XMFINLINE HALF XMConvertFloatToHalf
  118. (
  119. FLOAT Value
  120. )
  121. {
  122. #if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
  123. UINT Result;
  124. UINT IValue = ((UINT *)(&Value))[0];
  125. UINT Sign = (IValue & 0x80000000U) >> 16U;
  126. IValue = IValue & 0x7FFFFFFFU; // Hack off the sign
  127. if (IValue > 0x47FFEFFFU)
  128. {
  129. // The number is too large to be represented as a half. Saturate to infinity.
  130. Result = 0x7FFFU;
  131. }
  132. else
  133. {
  134. if (IValue < 0x38800000U)
  135. {
  136. // The number is too small to be represented as a normalized half.
  137. // Convert it to a denormalized value.
  138. UINT Shift = 113U - (IValue >> 23U);
  139. IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
  140. }
  141. else
  142. {
  143. // Rebias the exponent to represent the value as a normalized half.
  144. IValue += 0xC8000000U;
  145. }
  146. Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
  147. }
  148. return (HALF)(Result|Sign);
  149. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  150. #endif
  151. }
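// Usage sketch (annotation, not part of the original source): round-tripping a
// value through the two scalar conversions above.
//
//     FLOAT f = 3.5f;
//     HALF  h = XMConvertFloatToHalf(f);   // 0x4300
//     FLOAT r = XMConvertHalfToFloat(h);   // 3.5f again - exactly representable
//
// Values whose magnitude exceeds the largest representable half are clamped to
// 0x7FFF by the IValue > 0x47FFEFFFU branch above.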
  152. //------------------------------------------------------------------------------
  153. XMINLINE HALF* XMConvertFloatToHalfStream
  154. (
  155. HALF* pOutputStream,
  156. UINT OutputStride,
  157. CONST FLOAT* pInputStream,
  158. UINT InputStride,
  159. UINT FloatCount
  160. )
  161. {
  162. #if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
  163. UINT i;
  164. BYTE* pFloat = (BYTE*)pInputStream;
  165. BYTE* pHalf = (BYTE*)pOutputStream;
  166. XMASSERT(pOutputStream);
  167. XMASSERT(pInputStream);
  168. for (i = 0; i < FloatCount; i++)
  169. {
  170. *(HALF*)pHalf = XMConvertFloatToHalf(*(FLOAT*)pFloat);
  171. pFloat += InputStride;
  172. pHalf += OutputStride;
  173. }
  174. return pOutputStream;
  175. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  176. #endif // _XM_VMX128_INTRINSICS_
  177. }
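// Usage sketch (annotation, not part of the original source): the stride
// parameters are byte offsets between consecutive elements, so the stream
// converters can walk interleaved data. Hypothetical vertex layout for
// illustration only:
//
//     struct Vertex { HALF u, v; FLOAT weight; };   // assumed layout
//     Vertex verts[64];
//     FLOAT  u[64];
//     // Convert just the 'u' component of every vertex:
//     XMConvertHalfToFloatStream( u, sizeof(FLOAT),
//                                 &verts[0].u, sizeof(Vertex), 64 );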
  178. //------------------------------------------------------------------------------
  179. #if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
  180. // For VMX128, these routines are all defines in the main header
  181. #pragma warning(push)
  182. #pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized
  183. XMINLINE XMVECTOR XMConvertVectorIntToFloat
  184. (
  185. FXMVECTOR VInt,
  186. UINT DivExponent
  187. )
  188. {
  189. #if defined(_XM_NO_INTRINSICS_)
  190. UINT ElementIndex;
  191. FLOAT fScale;
  192. XMVECTOR Result;
  193. XMASSERT(DivExponent<32);
  194. fScale = 1.0f / (FLOAT)(1U << DivExponent);
  195. ElementIndex = 0;
  196. do {
  197. INT iTemp = (INT)VInt.vector4_u32[ElementIndex];
  198. Result.vector4_f32[ElementIndex] = ((FLOAT)iTemp) * fScale;
  199. } while (++ElementIndex<4);
  200. return Result;
  201. #else // _XM_SSE_INTRINSICS_
  202. XMASSERT(DivExponent<32);
  203. // Convert to floats
  204. XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
  205. // Convert DivExponent into 1.0f/(1<<DivExponent)
  206. UINT uScale = 0x3F800000U - (DivExponent << 23);
  207. // Splat the scalar value
  208. __m128i vScale = _mm_set1_epi32(uScale);
  209. vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
  210. return vResult;
  211. #endif
  212. }
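// Annotation (not part of the original source): the SSE path builds
// 1.0f/(1<<DivExponent) without a divide by subtracting DivExponent from the
// exponent field of 1.0f. Worked example for DivExponent == 16:
//   0x3F800000 - (16 << 23) = 0x37800000
// and 0x37800000 is the bit pattern of 2^-16 == 1.0f/65536.0f, which is then
// splatted and multiplied in.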
  213. //------------------------------------------------------------------------------
  214. XMINLINE XMVECTOR XMConvertVectorFloatToInt
  215. (
  216. FXMVECTOR VFloat,
  217. UINT MulExponent
  218. )
  219. {
  220. #if defined(_XM_NO_INTRINSICS_)
  221. UINT ElementIndex;
  222. XMVECTOR Result;
  223. FLOAT fScale;
  224. XMASSERT(MulExponent<32);
  225. // Get the scalar factor.
  226. fScale = (FLOAT)(1U << MulExponent);
  227. ElementIndex = 0;
  228. do {
  229. INT iResult;
  230. FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
  231. if (fTemp <= -(65536.0f*32768.0f)) {
  232. iResult = (-0x7FFFFFFF)-1;
  233. } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
  234. iResult = 0x7FFFFFFF;
  235. } else {
  236. iResult = (INT)fTemp;
  237. }
  238. Result.vector4_u32[ElementIndex] = (UINT)iResult;
  239. } while (++ElementIndex<4);
  240. return Result;
  241. #else // _XM_SSE_INTRINSICS_
  242. XMASSERT(MulExponent<32);
  243. static const XMVECTORF32 MaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f};
  244. XMVECTOR vResult = _mm_set_ps1((FLOAT)(1U << MulExponent));
  245. vResult = _mm_mul_ps(vResult,VFloat);
  246. // Detect positive overflow
  247. XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,MaxInt);
  248. // Float to int conversion
  249. __m128i vResulti = _mm_cvttps_epi32(vResult);
  250. // If there was positive overflow, set to 0x7FFFFFFF
  251. vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
  252. vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
  253. vOverflow = _mm_or_ps(vOverflow,vResult);
  254. return vOverflow;
  255. #endif
  256. }
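// Annotation (not part of the original source): the clamp constant
// 65536.0f*32768.0f-128.0f == 2147483520.0f is the largest float strictly below
// 2^31, so any product greater than it is forced to 0x7FFFFFFF before the
// truncating convert, which would otherwise return 0x80000000 on overflow.
// The scalar path above makes the same clamp explicit.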
  257. //------------------------------------------------------------------------------
  258. XMINLINE XMVECTOR XMConvertVectorUIntToFloat
  259. (
  260. FXMVECTOR VUInt,
  261. UINT DivExponent
  262. )
  263. {
  264. #if defined(_XM_NO_INTRINSICS_)
  265. UINT ElementIndex;
  266. FLOAT fScale;
  267. XMVECTOR Result;
  268. XMASSERT(DivExponent<32);
  269. fScale = 1.0f / (FLOAT)(1U << DivExponent);
  270. ElementIndex = 0;
  271. do {
  272. Result.vector4_f32[ElementIndex] = (FLOAT)VUInt.vector4_u32[ElementIndex] * fScale;
  273. } while (++ElementIndex<4);
  274. return Result;
  275. #else // _XM_SSE_INTRINSICS_
  276. XMASSERT(DivExponent<32);
  277. static const XMVECTORF32 FixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
  278. // For the values that are higher than 0x7FFFFFFF, a fixup is needed
  279. // Determine which ones need the fix.
  280. XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
  281. // Force all values positive
  282. XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
  283. // Convert to floats
  284. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  285. // Convert 0x80000000 -> 0xFFFFFFFF
  286. __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
  287. // For only the ones that are too big, add the fixup
  288. vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],FixUnsigned);
  289. vResult = _mm_add_ps(vResult,vMask);
  290. // Convert DivExponent into 1.0f/(1<<DivExponent)
  291. UINT uScale = 0x3F800000U - (DivExponent << 23);
  292. // Splat
  293. iMask = _mm_set1_epi32(uScale);
  294. vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
  295. return vResult;
  296. #endif
  297. }
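// Annotation (not part of the original source): worked example of the unsigned
// fixup above for the lane value 0xFFFFFFFF. The sign bit is masked off
// (0x7FFFFFFF), the convert rounds to 2147483648.0f, and 32768.0f*65536.0f == 2^31
// is added back only for lanes that had the sign bit set, giving 4294967296.0f,
// within rounding of the original 4294967295, before the final power-of-two scale.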
  298. //------------------------------------------------------------------------------
  299. XMINLINE XMVECTOR XMConvertVectorFloatToUInt
  300. (
  301. FXMVECTOR VFloat,
  302. UINT MulExponent
  303. )
  304. {
  305. #if defined(_XM_NO_INTRINSICS_)
  306. UINT ElementIndex;
  307. XMVECTOR Result;
  308. FLOAT fScale;
  309. XMASSERT(MulExponent<32);
  310. // Get the scalar factor.
  311. fScale = (FLOAT)(1U << MulExponent);
  312. ElementIndex = 0;
  313. do {
  314. UINT uResult;
  315. FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
  316. if (fTemp <= 0.0f) {
  317. uResult = 0;
  318. } else if (fTemp >= (65536.0f*65536.0f)) {
  319. uResult = 0xFFFFFFFFU;
  320. } else {
  321. uResult = (UINT)fTemp;
  322. }
  323. Result.vector4_u32[ElementIndex] = uResult;
  324. } while (++ElementIndex<4);
  325. return Result;
  326. #else // _XM_SSE_INTRINSICS_
  327. XMASSERT(MulExponent<32);
  328. static const XMVECTORF32 MaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f};
  329. static const XMVECTORF32 UnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
  330. XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
  331. vResult = _mm_mul_ps(vResult,VFloat);
  332. // Clamp to >=0
  333. vResult = _mm_max_ps(vResult,g_XMZero);
  334. // Any numbers that are too big, set to 0xFFFFFFFFU
  335. XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,MaxUInt);
  336. XMVECTOR vValue = UnsignedFix;
  337. // Too large for a signed integer?
  338. XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
  339. // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
  340. vValue = _mm_and_ps(vValue,vMask);
  341. // Perform fixup only on numbers too large (Keeps low bit precision)
  342. vResult = _mm_sub_ps(vResult,vValue);
  343. __m128i vResulti = _mm_cvttps_epi32(vResult);
  344. // Convert from signed to unsigned only if greater than or equal to 0x80000000
  345. vMask = _mm_and_ps(vMask,g_XMNegativeZero);
  346. vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
  347. // On those that are too large, set to 0xFFFFFFFF
  348. vResult = _mm_or_ps(vResult,vOverflow);
  349. return vResult;
  350. #endif
  351. }
  352. #pragma warning(pop)
  353. #endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_
  354. /****************************************************************************
  355. *
  356. * Vector and matrix load operations
  357. *
  358. ****************************************************************************/
  359. //------------------------------------------------------------------------------
  360. XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
  361. {
  362. #if defined(_XM_NO_INTRINSICS_)
  363. XMVECTOR V;
  364. XMASSERT(pSource);
  365. XMASSERT(((UINT_PTR)pSource & 3) == 0);
  366. V.vector4_u32[0] = *pSource;
  367. return V;
  368. #elif defined(_XM_SSE_INTRINSICS_)
  369. XMASSERT(pSource);
  370. XMASSERT(((UINT_PTR)pSource & 3) == 0);
  371. return _mm_load_ss( (const float*)pSource );
  372. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  373. #endif // _XM_VMX128_INTRINSICS_
  374. }
  375. //------------------------------------------------------------------------------
  376. XMFINLINE XMVECTOR XMLoadFloat(CONST FLOAT* pSource)
  377. {
  378. #if defined(_XM_NO_INTRINSICS_)
  379. XMVECTOR V;
  380. XMASSERT(pSource);
  381. XMASSERT(((UINT_PTR)pSource & 3) == 0);
  382. V.vector4_f32[0] = *pSource;
  383. return V;
  384. #elif defined(_XM_SSE_INTRINSICS_)
  385. XMASSERT(pSource);
  386. XMASSERT(((UINT_PTR)pSource & 3) == 0);
  387. return _mm_load_ss( pSource );
  388. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  389. #endif // _XM_VMX128_INTRINSICS_
  390. }
  391. //------------------------------------------------------------------------------
  392. XMFINLINE XMVECTOR XMLoadInt2
  393. (
  394. CONST UINT* pSource
  395. )
  396. {
  397. #if defined(_XM_NO_INTRINSICS_)
  398. XMVECTOR V;
  399. XMASSERT(pSource);
  400. V.vector4_u32[0] = pSource[0];
  401. V.vector4_u32[1] = pSource[1];
  402. return V;
  403. #elif defined(_XM_SSE_INTRINSICS_)
  404. XMASSERT(pSource);
  405. __m128 x = _mm_load_ss( (const float*)pSource );
  406. __m128 y = _mm_load_ss( (const float*)(pSource+1) );
  407. return _mm_unpacklo_ps( x, y );
  408. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  409. #endif // _XM_VMX128_INTRINSICS_
  410. }
  411. //------------------------------------------------------------------------------
  412. XMFINLINE XMVECTOR XMLoadInt2A
  413. (
  414. CONST UINT* pSource
  415. )
  416. {
  417. #if defined(_XM_NO_INTRINSICS_)
  418. XMVECTOR V;
  419. XMASSERT(pSource);
  420. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  421. V.vector4_u32[0] = pSource[0];
  422. V.vector4_u32[1] = pSource[1];
  423. return V;
  424. #elif defined(_XM_SSE_INTRINSICS_)
  425. XMASSERT(pSource);
  426. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  427. __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
  428. return reinterpret_cast<__m128 *>(&V)[0];
  429. #else // _XM_VMX128_INTRINSICS_
  430. #endif // _XM_VMX128_INTRINSICS_
  431. }
  432. //------------------------------------------------------------------------------
  433. XMFINLINE XMVECTOR XMLoadFloat2
  434. (
  435. CONST XMFLOAT2* pSource
  436. )
  437. {
  438. #if defined(_XM_NO_INTRINSICS_)
  439. XMVECTOR V;
  440. XMASSERT(pSource);
  441. ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
  442. ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
  443. return V;
  444. #elif defined(_XM_SSE_INTRINSICS_)
  445. XMASSERT(pSource);
  446. __m128 x = _mm_load_ss( &pSource->x );
  447. __m128 y = _mm_load_ss( &pSource->y );
  448. return _mm_unpacklo_ps( x, y );
  449. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  450. #endif // _XM_VMX128_INTRINSICS_
  451. }
  452. //------------------------------------------------------------------------------
  453. XMFINLINE XMVECTOR XMLoadFloat2A
  454. (
  455. CONST XMFLOAT2A* pSource
  456. )
  457. {
  458. #if defined(_XM_NO_INTRINSICS_)
  459. XMVECTOR V;
  460. XMASSERT(pSource);
  461. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  462. V.vector4_f32[0] = pSource->x;
  463. V.vector4_f32[1] = pSource->y;
  464. return V;
  465. #elif defined(_XM_SSE_INTRINSICS_)
  466. XMASSERT(pSource);
  467. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  468. __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
  469. return reinterpret_cast<__m128 *>(&V)[0];
  470. #else // _XM_VMX128_INTRINSICS_
  471. #endif // _XM_VMX128_INTRINSICS_
  472. }
  473. //------------------------------------------------------------------------------
  474. XMFINLINE XMVECTOR XMLoadHalf2
  475. (
  476. CONST XMHALF2* pSource
  477. )
  478. {
  479. #if defined(_XM_NO_INTRINSICS_)
  480. XMASSERT(pSource);
  481. {
  482. XMVECTOR vResult = {
  483. XMConvertHalfToFloat(pSource->x),
  484. XMConvertHalfToFloat(pSource->y),
  485. 0.0f,
  486. 0.0f
  487. };
  488. return vResult;
  489. }
  490. #elif defined(_XM_SSE_INTRINSICS_)
  491. XMASSERT(pSource);
  492. XMVECTOR vResult = {
  493. XMConvertHalfToFloat(pSource->x),
  494. XMConvertHalfToFloat(pSource->y),
  495. 0.0f,
  496. 0.0f
  497. };
  498. return vResult;
  499. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  500. #endif // _XM_VMX128_INTRINSICS_
  501. }
  502. //------------------------------------------------------------------------------
  503. XMFINLINE XMVECTOR XMLoadShortN2
  504. (
  505. CONST XMSHORTN2* pSource
  506. )
  507. {
  508. #if defined(_XM_NO_INTRINSICS_)
  509. XMASSERT(pSource);
  510. XMASSERT(pSource->x != -32768);
  511. XMASSERT(pSource->y != -32768);
  512. {
  513. XMVECTOR vResult = {
  514. (FLOAT)pSource->x * (1.0f/32767.0f),
  515. (FLOAT)pSource->y * (1.0f/32767.0f),
  516. 0.0f,
  517. 0.0f
  518. };
  519. return vResult;
  520. }
  521. #elif defined(_XM_SSE_INTRINSICS_)
  522. XMASSERT(pSource);
  523. XMASSERT(pSource->x != -32768);
  524. XMASSERT(pSource->y != -32768);
  525. // Splat the two shorts in all four entries (WORD alignment okay,
  526. // DWORD alignment preferred)
  527. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  528. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  529. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  530. // x needs to be sign extended
  531. vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
  532. // Convert to floating point numbers
  533. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  534. // x - 0x8000 to undo the signed order.
  535. vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
  536. // Convert 0-32767 to 0.0f-1.0f
  537. return _mm_mul_ps(vTemp,g_XMNormalizeX16Y16);
  538. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  539. #endif // _XM_VMX128_INTRINSICS_
  540. }
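// Annotation (not part of the original source): scalar equivalent of the
// splat/mask/xor/add trick above, assuming the g_XM* constants hold the usual
// 0x8000-style flip/fix values. For the x component:
//
//     UINT  packed = *(const UINT*)&pSource->x;   // y sits in the high 16 bits
//     UINT  bits   = (packed & 0xFFFF) ^ 0x8000;  // signed -> offset binary
//     FLOAT fx     = ((FLOAT)(INT)bits - 32768.0f) * (1.0f/32767.0f);
//
// The y lane goes through the same steps with everything scaled by 65536,
// because y is kept in the upper half of the 32-bit word.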
  541. //------------------------------------------------------------------------------
  542. XMFINLINE XMVECTOR XMLoadShort2
  543. (
  544. CONST XMSHORT2* pSource
  545. )
  546. {
  547. #if defined(_XM_NO_INTRINSICS_)
  548. XMVECTOR V;
  549. XMASSERT(pSource);
  550. XMASSERT(pSource->x != -32768);
  551. XMASSERT(pSource->y != -32768);
  552. V.vector4_f32[0] = (FLOAT)pSource->x;
  553. V.vector4_f32[1] = (FLOAT)pSource->y;
  554. return V;
  555. #elif defined(_XM_SSE_INTRINSICS_)
  556. XMASSERT(pSource);
  557. XMASSERT(pSource->x != -32768);
  558. XMASSERT(pSource->y != -32768);
  559. // Splat the two shorts in all four entries (WORD alignment okay,
  560. // DWORD alignment preferred)
  561. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  562. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  563. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  564. // x needs to be sign extended
  565. vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
  566. // Convert to floating point numbers
  567. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  568. // x - 0x8000 to undo the signed order.
  569. vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
  570. // Y is 65536 times too large
  571. return _mm_mul_ps(vTemp,g_XMFixupY16);
  572. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  573. #endif // _XM_VMX128_INTRINSICS_
  574. }
  575. //------------------------------------------------------------------------------
  576. XMFINLINE XMVECTOR XMLoadUShortN2
  577. (
  578. CONST XMUSHORTN2* pSource
  579. )
  580. {
  581. #if defined(_XM_NO_INTRINSICS_)
  582. XMVECTOR V;
  583. XMASSERT(pSource);
  584. V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
  585. V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
  586. return V;
  587. #elif defined(_XM_SSE_INTRINSICS_)
  588. static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f};
  589. static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0};
  590. XMASSERT(pSource);
  591. // Splat the two shorts in all four entries (WORD alignment okay,
  592. // DWORD alignment preferred)
  593. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  594. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  595. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  596. // y needs to be sign flipped
  597. vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
  598. // Convert to floating point numbers
  599. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  600. // y + 0x8000 to undo the signed order.
  601. vTemp = _mm_add_ps(vTemp,FixaddY16);
  602. // Y is 65536 times too large
  603. vTemp = _mm_mul_ps(vTemp,FixupY16);
  604. return vTemp;
  605. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  606. #endif // _XM_VMX128_INTRINSICS_
  607. }
  608. //------------------------------------------------------------------------------
  609. XMFINLINE XMVECTOR XMLoadUShort2
  610. (
  611. CONST XMUSHORT2* pSource
  612. )
  613. {
  614. #if defined(_XM_NO_INTRINSICS_)
  615. XMVECTOR V;
  616. XMASSERT(pSource);
  617. V.vector4_f32[0] = (FLOAT)pSource->x;
  618. V.vector4_f32[1] = (FLOAT)pSource->y;
  619. return V;
  620. #elif defined(_XM_SSE_INTRINSICS_)
  621. static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0};
  622. XMASSERT(pSource);
  623. // Splat the two shorts in all four entries (WORD alignment okay,
  624. // DWORD alignment preferred)
  625. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
  626. // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
  627. vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
  628. // y needs to be sign flipped
  629. vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
  630. // Convert to floating point numbers
  631. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  632. // Y is 65536 times too large
  633. vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
  634. // y + 0x8000 to undo the signed order.
  635. vTemp = _mm_add_ps(vTemp,FixaddY16);
  636. return vTemp;
  637. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  638. #endif // _XM_VMX128_INTRINSICS_
  639. }
  640. //------------------------------------------------------------------------------
  641. XMFINLINE XMVECTOR XMLoadInt3
  642. (
  643. CONST UINT* pSource
  644. )
  645. {
  646. #if defined(_XM_NO_INTRINSICS_)
  647. XMVECTOR V;
  648. XMASSERT(pSource);
  649. V.vector4_u32[0] = pSource[0];
  650. V.vector4_u32[1] = pSource[1];
  651. V.vector4_u32[2] = pSource[2];
  652. return V;
  653. #elif defined(_XM_SSE_INTRINSICS_)
  654. XMASSERT(pSource);
  655. #ifdef _XM_ISVS2005_
  656. __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
  657. return reinterpret_cast<__m128 *>(&V)[0];
  658. #else
  659. __m128 x = _mm_load_ss( (const float*)pSource );
  660. __m128 y = _mm_load_ss( (const float*)(pSource+1) );
  661. __m128 z = _mm_load_ss( (const float*)(pSource+2) );
  662. __m128 xy = _mm_unpacklo_ps( x, y );
  663. return _mm_movelh_ps( xy, z );
  664. #endif // !_XM_ISVS2005_
  665. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  666. #endif // _XM_VMX128_INTRINSICS_
  667. }
  668. //------------------------------------------------------------------------------
  669. XMFINLINE XMVECTOR XMLoadInt3A
  670. (
  671. CONST UINT* pSource
  672. )
  673. {
  674. #if defined(_XM_NO_INTRINSICS_)
  675. XMVECTOR V;
  676. XMASSERT(pSource);
  677. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  678. V.vector4_u32[0] = pSource[0];
  679. V.vector4_u32[1] = pSource[1];
  680. V.vector4_u32[2] = pSource[2];
  681. return V;
  682. #elif defined(_XM_SSE_INTRINSICS_)
  683. XMASSERT(pSource);
  684. // Reads an extra integer that is 'undefined'
  685. __m128i V = _mm_load_si128( (const __m128i*)pSource );
  686. return reinterpret_cast<__m128 *>(&V)[0];
  687. #else // _XM_VMX128_INTRINSICS_
  688. #endif // _XM_VMX128_INTRINSICS_
  689. }
  690. //------------------------------------------------------------------------------
  691. XMFINLINE XMVECTOR XMLoadFloat3
  692. (
  693. CONST XMFLOAT3* pSource
  694. )
  695. {
  696. #if defined(_XM_NO_INTRINSICS_)
  697. XMVECTOR V;
  698. XMASSERT(pSource);
  699. ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
  700. ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
  701. ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
  702. return V;
  703. #elif defined(_XM_SSE_INTRINSICS_)
  704. XMASSERT(pSource);
  705. #ifdef _XM_ISVS2005_
  706. // This reads 1 float past the end of the structure; the extra value is ignored.
  707. // Need to continue to do this for VS 2005 due to compiler issue but prefer new method
  708. // to avoid triggering issues with memory debug tools (like AV)
  709. return _mm_loadu_ps( &pSource->x );
  710. #else
  711. __m128 x = _mm_load_ss( &pSource->x );
  712. __m128 y = _mm_load_ss( &pSource->y );
  713. __m128 z = _mm_load_ss( &pSource->z );
  714. __m128 xy = _mm_unpacklo_ps( x, y );
  715. return _mm_movelh_ps( xy, z );
  716. #endif // !_XM_ISVS2005_
  717. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  718. #endif // _XM_VMX128_INTRINSICS_
  719. }
  720. //------------------------------------------------------------------------------
  721. XMFINLINE XMVECTOR XMLoadFloat3A
  722. (
  723. CONST XMFLOAT3A* pSource
  724. )
  725. {
  726. #if defined(_XM_NO_INTRINSICS_)
  727. XMVECTOR V;
  728. XMASSERT(pSource);
  729. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  730. V.vector4_f32[0] = pSource->x;
  731. V.vector4_f32[1] = pSource->y;
  732. V.vector4_f32[2] = pSource->z;
  733. return V;
  734. #elif defined(_XM_SSE_INTRINSICS_)
  735. XMASSERT(pSource);
  736. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  737. // This reads 1 float past the end of the structure; the extra value is ignored.
  738. return _mm_load_ps( &pSource->x );
  739. #else // _XM_VMX128_INTRINSICS_
  740. #endif // _XM_VMX128_INTRINSICS_
  741. }
  742. //------------------------------------------------------------------------------
  743. XMFINLINE XMVECTOR XMLoadUHenDN3
  744. (
  745. CONST XMUHENDN3* pSource
  746. )
  747. {
  748. #if defined(_XM_NO_INTRINSICS_)
  749. XMVECTOR V;
  750. UINT Element;
  751. XMASSERT(pSource);
  752. Element = pSource->v & 0x7FF;
  753. V.vector4_f32[0] = (FLOAT)Element / 2047.0f;
  754. Element = (pSource->v >> 11) & 0x7FF;
  755. V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
  756. Element = (pSource->v >> 22) & 0x3FF;
  757. V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
  758. return V;
  759. #elif defined(_XM_SSE_INTRINSICS_)
  760. static const XMVECTORF32 UHenDN3Mul = {1.0f/2047.0f,1.0f/(2047.0f*2048.0f),1.0f/(1023.0f*2048.0f*2048.0f),0};
  761. XMASSERT(pSource);
  762. // Get the 32 bit value and splat it
  763. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  764. // Mask off x, y and z
  765. vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
  766. // Convert x and y to unsigned
  767. vResult = _mm_xor_ps(vResult,g_XMFlipZ);
  768. // Convert to float
  769. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  770. // Convert x and y back to signed
  771. vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
  772. // Normalize x,y and z to -1.0f-1.0f
  773. vResult = _mm_mul_ps(vResult,UHenDN3Mul);
  774. return vResult;
  775. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  776. #endif // _XM_VMX128_INTRINSICS_
  777. }
  778. //------------------------------------------------------------------------------
  779. XMFINLINE XMVECTOR XMLoadUHenD3
  780. (
  781. CONST XMUHEND3* pSource
  782. )
  783. {
  784. #if defined(_XM_NO_INTRINSICS_)
  785. XMVECTOR V;
  786. UINT Element;
  787. XMASSERT(pSource);
  788. Element = pSource->v & 0x7FF;
  789. V.vector4_f32[0] = (FLOAT)Element;
  790. Element = (pSource->v >> 11) & 0x7FF;
  791. V.vector4_f32[1] = (FLOAT)Element;
  792. Element = (pSource->v >> 22) & 0x3FF;
  793. V.vector4_f32[2] = (FLOAT)Element;
  794. return V;
  795. #elif defined(_XM_SSE_INTRINSICS_)
  796. XMASSERT(pSource);
  797. // Get the 32 bit value and splat it
  798. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  799. // Mask off x, y and z
  800. vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
  801. // Convert x and y to unsigned
  802. vResult = _mm_xor_ps(vResult,g_XMFlipZ);
  803. // Convert to float
  804. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  805. // Convert x and y back to signed
  806. vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
  807. // Normalize x and y to -1024-1023.0f and z to -512-511.0f
  808. vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
  809. return vResult;
  810. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  811. #endif // _XM_VMX128_INTRINSICS_
  812. }
  813. //------------------------------------------------------------------------------
  814. XMFINLINE XMVECTOR XMLoadHenDN3
  815. (
  816. CONST XMHENDN3* pSource
  817. )
  818. {
  819. #if defined(_XM_NO_INTRINSICS_)
  820. XMVECTOR V;
  821. UINT Element;
  822. static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
  823. static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};
  824. XMASSERT(pSource);
  825. XMASSERT((pSource->v & 0x7FF) != 0x400);
  826. XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
  827. XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
  828. Element = pSource->v & 0x7FF;
  829. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
  830. Element = (pSource->v >> 11) & 0x7FF;
  831. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
  832. Element = (pSource->v >> 22) & 0x3FF;
  833. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]) / 511.0f;
  834. return V;
  835. #elif defined(_XM_SSE_INTRINSICS_)
  836. static const XMVECTORF32 HenDN3Mul = {1.0f/1023.0f,1.0f/(1023.0f*2048.0f),1.0f/(511.0f*2048.0f*2048.0f),0};
  837. XMASSERT(pSource);
  838. XMASSERT((pSource->v & 0x7FF) != 0x400);
  839. XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
  840. XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
  841. // Get the 32 bit value and splat it
  842. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  843. // Mask off x, y and z
  844. vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
  845. // Convert x and y to unsigned
  846. vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
  847. // Convert to float
  848. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  849. // Convert x and y back to signed
  850. vResult = _mm_add_ps(vResult,g_XMAddHenD3);
  851. // Normalize x,y and z to -1.0f-1.0f
  852. vResult = _mm_mul_ps(vResult,HenDN3Mul);
  853. return vResult;
  854. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  855. #endif // _XM_VMX128_INTRINSICS_
  856. }
  857. //------------------------------------------------------------------------------
  858. XMFINLINE XMVECTOR XMLoadHenD3
  859. (
  860. CONST XMHEND3* pSource
  861. )
  862. {
  863. #if defined(_XM_NO_INTRINSICS_)
  864. XMVECTOR V;
  865. UINT Element;
  866. static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
  867. static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};
  868. XMASSERT(pSource);
  869. XMASSERT((pSource->v & 0x7FF) != 0x400);
  870. XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
  871. XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
  872. Element = pSource->v & 0x7FF;
  873. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
  874. Element = (pSource->v >> 11) & 0x7FF;
  875. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
  876. Element = (pSource->v >> 22) & 0x3FF;
  877. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]);
  878. return V;
  879. #elif defined(_XM_SSE_INTRINSICS_)
  880. XMASSERT(pSource);
  881. XMASSERT((pSource->v & 0x7FF) != 0x400);
  882. XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
  883. XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
  884. // Get the 32 bit value and splat it
  885. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  886. // Mask off x, y and z
  887. vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
  888. // Convert x and y to unsigned
  889. vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
  890. // Convert to float
  891. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  892. // Convert x and y back to signed
  893. vResult = _mm_add_ps(vResult,g_XMAddHenD3);
  894. // Normalize x and y to -1024-1023.0f and z to -512-511.0f
  895. vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
  896. return vResult;
  897. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  898. #endif // _XM_VMX128_INTRINSICS_
  899. }
  900. //------------------------------------------------------------------------------
  901. XMFINLINE XMVECTOR XMLoadUDHenN3
  902. (
  903. CONST XMUDHENN3* pSource
  904. )
  905. {
  906. #if defined(_XM_NO_INTRINSICS_)
  907. XMVECTOR V;
  908. UINT Element;
  909. XMASSERT(pSource);
  910. Element = pSource->v & 0x3FF;
  911. V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
  912. Element = (pSource->v >> 10) & 0x7FF;
  913. V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
  914. Element = (pSource->v >> 21) & 0x7FF;
  915. V.vector4_f32[2] = (FLOAT)Element / 2047.0f;
  916. return V;
  917. #elif defined(_XM_SSE_INTRINSICS_)
  918. static const XMVECTORF32 UDHenN3Mul = {1.0f/1023.0f,1.0f/(2047.0f*1024.0f),1.0f/(2047.0f*1024.0f*2048.0f),0};
  919. XMASSERT(pSource);
  920. // Get the 32 bit value and splat it
  921. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  922. // Mask off x, y and z
  923. vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
  924. // Convert x and y to unsigned
  925. vResult = _mm_xor_ps(vResult,g_XMFlipZ);
  926. // Convert to float
  927. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  928. // Convert x and y back to signed
  929. vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
  930. // Normalize x,y and z to -1.0f-1.0f
  931. vResult = _mm_mul_ps(vResult,UDHenN3Mul);
  932. return vResult;
  933. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  934. #endif // _XM_VMX128_INTRINSICS_
  935. }
  936. //------------------------------------------------------------------------------
  937. XMFINLINE XMVECTOR XMLoadUDHen3
  938. (
  939. CONST XMUDHEN3* pSource
  940. )
  941. {
  942. #if defined(_XM_NO_INTRINSICS_)
  943. XMVECTOR V;
  944. UINT Element;
  945. XMASSERT(pSource);
  946. Element = pSource->v & 0x3FF;
  947. V.vector4_f32[0] = (FLOAT)Element;
  948. Element = (pSource->v >> 10) & 0x7FF;
  949. V.vector4_f32[1] = (FLOAT)Element;
  950. Element = (pSource->v >> 21) & 0x7FF;
  951. V.vector4_f32[2] = (FLOAT)Element;
  952. return V;
  953. #elif defined(_XM_SSE_INTRINSICS_)
  954. XMASSERT(pSource);
  955. // Get the 32 bit value and splat it
  956. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  957. // Mask off x, y and z
  958. vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
  959. // Convert x and y to unsigned
  960. vResult = _mm_xor_ps(vResult,g_XMFlipZ);
  961. // Convert to float
  962. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  963. // Convert x and y back to signed
  964. vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
  965. // Normalize x to 0-1023.0f and y and z to 0-2047.0f
  966. vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
  967. return vResult;
  968. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  969. #endif // _XM_VMX128_INTRINSICS_
  970. }
  971. //------------------------------------------------------------------------------
  972. XMFINLINE XMVECTOR XMLoadDHenN3
  973. (
  974. CONST XMDHENN3* pSource
  975. )
  976. {
  977. #if defined(_XM_NO_INTRINSICS_)
  978. XMVECTOR V;
  979. UINT Element;
  980. static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
  981. static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};
  982. XMASSERT(pSource);
  983. XMASSERT((pSource->v & 0x3FF) != 0x200);
  984. XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
  985. XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
  986. Element = pSource->v & 0x3FF;
  987. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]) / 511.0f;
  988. Element = (pSource->v >> 10) & 0x7FF;
  989. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
  990. Element = (pSource->v >> 21) & 0x7FF;
  991. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
  992. return V;
  993. #elif defined(_XM_SSE_INTRINSICS_)
  994. static const XMVECTORF32 DHenN3Mul = {1.0f/511.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*2048.0f),0};
  995. XMASSERT(pSource);
  996. XMASSERT((pSource->v & 0x3FF) != 0x200);
  997. XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
  998. XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
  999. // Get the 32 bit value and splat it
  1000. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  1001. // Mask off x, y and z
  1002. vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
  1003. // Convert x and y to unsigned
  1004. vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
  1005. // Convert to float
  1006. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  1007. // Convert x and y back to signed
  1008. vResult = _mm_add_ps(vResult,g_XMAddDHen3);
  1009. // Normalize x,y and z to -1.0f-1.0f
  1010. vResult = _mm_mul_ps(vResult,DHenN3Mul);
  1011. return vResult;
  1012. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1013. #endif // _XM_VMX128_INTRINSICS_
  1014. }
  1015. //------------------------------------------------------------------------------
  1016. XMFINLINE XMVECTOR XMLoadDHen3
  1017. (
  1018. CONST XMDHEN3* pSource
  1019. )
  1020. {
  1021. #if defined(_XM_NO_INTRINSICS_)
  1022. XMVECTOR V;
  1023. UINT Element;
  1024. static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
  1025. static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};
  1026. XMASSERT(pSource);
  1027. XMASSERT((pSource->v & 0x3FF) != 0x200);
  1028. XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
  1029. XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
  1030. Element = pSource->v & 0x3FF;
  1031. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]);
  1032. Element = (pSource->v >> 10) & 0x7FF;
  1033. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
  1034. Element = (pSource->v >> 21) & 0x7FF;
  1035. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
  1036. return V;
  1037. #elif defined(_XM_SSE_INTRINSICS_)
  1038. XMASSERT(pSource);
  1039. XMASSERT((pSource->v & 0x3FF) != 0x200);
  1040. XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
  1041. XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
  1042. // Get the 32 bit value and splat it
  1043. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  1044. // Mask off x, y and z
  1045. vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
  1046. // Convert x and y to unsigned
  1047. vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
  1048. // Convert to float
  1049. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  1050. // Convert x and y back to signed
  1051. vResult = _mm_add_ps(vResult,g_XMAddDHen3);
  1052. // Normalize x to -512-511.0f and y and z to -1024-1023.0f
  1053. vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
  1054. return vResult;
  1055. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1056. #endif // _XM_VMX128_INTRINSICS_
  1057. }
  1058. //------------------------------------------------------------------------------
  1059. XMFINLINE XMVECTOR XMLoadU565
  1060. (
  1061. CONST XMU565* pSource
  1062. )
  1063. {
  1064. #if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  1065. static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0};
  1066. static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0};
  1067. XMASSERT(pSource);
  1068. // Get the 32 bit value and splat it
  1069. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
  1070. // Mask off x, y and z
  1071. vResult = _mm_and_ps(vResult,U565And);
  1072. // Convert to float
  1073. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
  1074. // Normalize x, y, and z
  1075. vResult = _mm_mul_ps(vResult,U565Mul);
  1076. return vResult;
  1077. #else
  1078. XMVECTOR V;
  1079. UINT Element;
  1080. XMASSERT(pSource);
  1081. Element = pSource->v & 0x1F;
  1082. V.vector4_f32[0] = (FLOAT)Element;
  1083. Element = (pSource->v >> 5) & 0x3F;
  1084. V.vector4_f32[1] = (FLOAT)Element;
  1085. Element = (pSource->v >> 11) & 0x1F;
  1086. V.vector4_f32[2] = (FLOAT)Element;
  1087. return V;
  1088. #endif // !_XM_SSE_INTRINSICS_
  1089. }
  1090. //------------------------------------------------------------------------------
  1091. XMFINLINE XMVECTOR XMLoadFloat3PK
  1092. (
  1093. CONST XMFLOAT3PK* pSource
  1094. )
  1095. {
  1096. _DECLSPEC_ALIGN_16_ UINT Result[4];
  1097. UINT Mantissa;
  1098. UINT Exponent;
  1099. XMASSERT(pSource);
  1100. // X Channel (6-bit mantissa)
  1101. Mantissa = pSource->xm;
  1102. if ( pSource->xe == 0x1f ) // INF or NAN
  1103. {
  1104. Result[0] = 0x7f800000 | (pSource->xm << 17);
  1105. }
  1106. else
  1107. {
  1108. if ( pSource->xe != 0 ) // The value is normalized
  1109. {
  1110. Exponent = pSource->xe;
  1111. }
  1112. else if (Mantissa != 0) // The value is denormalized
  1113. {
  1114. // Normalize the value in the resulting float
  1115. Exponent = 1;
  1116. do
  1117. {
  1118. Exponent--;
  1119. Mantissa <<= 1;
  1120. } while ((Mantissa & 0x40) == 0);
  1121. Mantissa &= 0x3F;
  1122. }
  1123. else // The value is zero
  1124. {
  1125. Exponent = (UINT)-112;
  1126. }
  1127. Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
  1128. }
  1129. // Y Channel (6-bit mantissa)
  1130. Mantissa = pSource->ym;
  1131. if ( pSource->ye == 0x1f ) // INF or NAN
  1132. {
  1133. Result[1] = 0x7f800000 | (pSource->ym << 17);
  1134. }
  1135. else
  1136. {
  1137. if ( pSource->ye != 0 ) // The value is normalized
  1138. {
  1139. Exponent = pSource->ye;
  1140. }
  1141. else if (Mantissa != 0) // The value is denormalized
  1142. {
  1143. // Normalize the value in the resulting float
  1144. Exponent = 1;
  1145. do
  1146. {
  1147. Exponent--;
  1148. Mantissa <<= 1;
  1149. } while ((Mantissa & 0x40) == 0);
  1150. Mantissa &= 0x3F;
  1151. }
  1152. else // The value is zero
  1153. {
  1154. Exponent = (UINT)-112;
  1155. }
  1156. Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
  1157. }
  1158. // Z Channel (5-bit mantissa)
  1159. Mantissa = pSource->zm;
  1160. if ( pSource->ze == 0x1f ) // INF or NAN
  1161. {
  1162. Result[2] = 0x7f800000 | (pSource->zm << 17);
  1163. }
  1164. else
  1165. {
  1166. if ( pSource->ze != 0 ) // The value is normalized
  1167. {
  1168. Exponent = pSource->ze;
  1169. }
  1170. else if (Mantissa != 0) // The value is denormalized
  1171. {
  1172. // Normalize the value in the resulting float
  1173. Exponent = 1;
  1174. do
  1175. {
  1176. Exponent--;
  1177. Mantissa <<= 1;
  1178. } while ((Mantissa & 0x20) == 0);
  1179. Mantissa &= 0x1F;
  1180. }
  1181. else // The value is zero
  1182. {
  1183. Exponent = (UINT)-112;
  1184. }
  1185. Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
  1186. }
  1187. return XMLoadFloat3A( (XMFLOAT3A*)&Result );
  1188. }
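// Annotation (not part of the original source): each channel above is a small
// float with a 5-bit exponent (bias 15) and a 6-bit (x, y) or 5-bit (z) mantissa,
// rebuilt as a full 32-bit float by rebiasing the exponent with +112 (127 - 15)
// and left-aligning the mantissa. Worked example for the x channel with
// xe == 16 and xm == 32 (binary 100000, i.e. a fraction of 0.5):
//   Result[0] = ((16 + 112) << 23) | (32 << 17) = 0x40400000
// which is the bit pattern of 3.0f (1.5 * 2^1).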
  1189. //------------------------------------------------------------------------------
  1190. XMFINLINE XMVECTOR XMLoadFloat3SE
  1191. (
  1192. CONST XMFLOAT3SE* pSource
  1193. )
  1194. {
  1195. _DECLSPEC_ALIGN_16_ UINT Result[4];
  1196. UINT Mantissa;
  1197. UINT Exponent, ExpBits;
  1198. XMASSERT(pSource);
  1199. if ( pSource->e == 0x1f ) // INF or NAN
  1200. {
  1201. Result[0] = 0x7f800000 | (pSource->xm << 14);
  1202. Result[1] = 0x7f800000 | (pSource->ym << 14);
  1203. Result[2] = 0x7f800000 | (pSource->zm << 14);
  1204. }
  1205. else if ( pSource->e != 0 ) // The values are all normalized
  1206. {
  1207. Exponent = pSource->e;
  1208. ExpBits = (Exponent + 112) << 23;
  1209. Mantissa = pSource->xm;
  1210. Result[0] = ExpBits | (Mantissa << 14);
  1211. Mantissa = pSource->ym;
  1212. Result[1] = ExpBits | (Mantissa << 14);
  1213. Mantissa = pSource->zm;
  1214. Result[2] = ExpBits | (Mantissa << 14);
  1215. }
  1216. else
  1217. {
  1218. // X Channel
  1219. Mantissa = pSource->xm;
  1220. if (Mantissa != 0) // The value is denormalized
  1221. {
  1222. // Normalize the value in the resulting float
  1223. Exponent = 1;
  1224. do
  1225. {
  1226. Exponent--;
  1227. Mantissa <<= 1;
  1228. } while ((Mantissa & 0x200) == 0);
  1229. Mantissa &= 0x1FF;
  1230. }
  1231. else // The value is zero
  1232. {
  1233. Exponent = (UINT)-112;
  1234. }
  1235. Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14);
  1236. // Y Channel
  1237. Mantissa = pSource->ym;
  1238. if (Mantissa != 0) // The value is denormalized
  1239. {
  1240. // Normalize the value in the resulting float
  1241. Exponent = 1;
  1242. do
  1243. {
  1244. Exponent--;
  1245. Mantissa <<= 1;
  1246. } while ((Mantissa & 0x200) == 0);
  1247. Mantissa &= 0x1FF;
  1248. }
  1249. else // The value is zero
  1250. {
  1251. Exponent = (UINT)-112;
  1252. }
  1253. Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14);
  1254. // Z Channel
  1255. Mantissa = pSource->zm;
  1256. if (Mantissa != 0) // The value is denormalized
  1257. {
  1258. // Normalize the value in the resulting float
  1259. Exponent = 1;
  1260. do
  1261. {
  1262. Exponent--;
  1263. Mantissa <<= 1;
  1264. } while ((Mantissa & 0x200) == 0);
  1265. Mantissa &= 0x1FF;
  1266. }
  1267. else // The value is zero
  1268. {
  1269. Exponent = (UINT)-112;
  1270. }
  1271. Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
  1272. }
  1273. return XMLoadFloat3A( (XMFLOAT3A*)&Result );
  1274. }
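// Annotation (not part of the original source): here all three channels share
// one 5-bit exponent (bias 15) and each carries a 9-bit mantissa, shifted up by
// 14 (23 - 9) into a full float. Worked example with e == 16 and xm == 256
// (binary 100000000, i.e. a fraction of 0.5):
//   Result[0] = ((16 + 112) << 23) | (256 << 14) = 0x40400000 == 3.0f
// so for a nonzero shared exponent the three channels differ only in their
// mantissas, not in scale.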
  1275. //------------------------------------------------------------------------------
  1276. XMFINLINE XMVECTOR XMLoadInt4
  1277. (
  1278. CONST UINT* pSource
  1279. )
  1280. {
  1281. #if defined(_XM_NO_INTRINSICS_)
  1282. XMVECTOR V;
  1283. XMASSERT(pSource);
  1284. V.vector4_u32[0] = pSource[0];
  1285. V.vector4_u32[1] = pSource[1];
  1286. V.vector4_u32[2] = pSource[2];
  1287. V.vector4_u32[3] = pSource[3];
  1288. return V;
  1289. #elif defined(_XM_SSE_INTRINSICS_)
  1290. XMASSERT(pSource);
  1291. __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
  1292. return reinterpret_cast<__m128 *>(&V)[0];
  1293. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1294. #endif // _XM_VMX128_INTRINSICS_
  1295. }
  1296. //------------------------------------------------------------------------------
  1297. XMFINLINE XMVECTOR XMLoadInt4A
  1298. (
  1299. CONST UINT* pSource
  1300. )
  1301. {
  1302. #if defined(_XM_NO_INTRINSICS_)
  1303. XMVECTOR V;
  1304. XMASSERT(pSource);
  1305. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  1306. V.vector4_u32[0] = pSource[0];
  1307. V.vector4_u32[1] = pSource[1];
  1308. V.vector4_u32[2] = pSource[2];
  1309. V.vector4_u32[3] = pSource[3];
  1310. return V;
  1311. #elif defined(_XM_SSE_INTRINSICS_)
  1312. XMASSERT(pSource);
  1313. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  1314. __m128i V = _mm_load_si128( (const __m128i*)pSource );
  1315. return reinterpret_cast<__m128 *>(&V)[0];
  1316. #else // _XM_VMX128_INTRINSICS_
  1317. #endif // _XM_VMX128_INTRINSICS_
  1318. }
  1319. //------------------------------------------------------------------------------
  1320. XMFINLINE XMVECTOR XMLoadFloat4
  1321. (
  1322. CONST XMFLOAT4* pSource
  1323. )
  1324. {
  1325. #if defined(_XM_NO_INTRINSICS_)
  1326. XMVECTOR V;
  1327. XMASSERT(pSource);
  1328. ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
  1329. ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
  1330. ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
  1331. ((UINT *)(&V.vector4_f32[3]))[0] = ((const UINT *)(&pSource->w))[0];
  1332. return V;
  1333. #elif defined(_XM_SSE_INTRINSICS_)
  1334. XMASSERT(pSource);
  1335. return _mm_loadu_ps( &pSource->x );
  1336. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1337. #endif // _XM_VMX128_INTRINSICS_
  1338. }
  1339. //------------------------------------------------------------------------------
  1340. XMFINLINE XMVECTOR XMLoadFloat4A
  1341. (
  1342. CONST XMFLOAT4A* pSource
  1343. )
  1344. {
  1345. #if defined(_XM_NO_INTRINSICS_)
  1346. XMVECTOR V;
  1347. XMASSERT(pSource);
  1348. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  1349. V.vector4_f32[0] = pSource->x;
  1350. V.vector4_f32[1] = pSource->y;
  1351. V.vector4_f32[2] = pSource->z;
  1352. V.vector4_f32[3] = pSource->w;
  1353. return V;
  1354. #elif defined(_XM_SSE_INTRINSICS_)
  1355. XMASSERT(pSource);
  1356. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  1357. return _mm_load_ps( &pSource->x );
  1358. #else // _XM_VMX128_INTRINSICS_
  1359. #endif // _XM_VMX128_INTRINSICS_
  1360. }
  1361. //------------------------------------------------------------------------------
  1362. XMFINLINE XMVECTOR XMLoadHalf4
  1363. (
  1364. CONST XMHALF4* pSource
  1365. )
  1366. {
  1367. #if defined(_XM_NO_INTRINSICS_)
  1368. XMASSERT(pSource);
  1369. {
  1370. XMVECTOR vResult = {
  1371. XMConvertHalfToFloat(pSource->x),
  1372. XMConvertHalfToFloat(pSource->y),
  1373. XMConvertHalfToFloat(pSource->z),
  1374. XMConvertHalfToFloat(pSource->w)
  1375. };
  1376. return vResult;
  1377. }
  1378. #elif defined(_XM_SSE_INTRINSICS_)
  1379. XMASSERT(pSource);
  1380. XMVECTOR vResult = {
  1381. XMConvertHalfToFloat(pSource->x),
  1382. XMConvertHalfToFloat(pSource->y),
  1383. XMConvertHalfToFloat(pSource->z),
  1384. XMConvertHalfToFloat(pSource->w)
  1385. };
  1386. return vResult;
  1387. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1388. #endif // _XM_VMX128_INTRINSICS_
  1389. }
  1390. //------------------------------------------------------------------------------
  1391. XMFINLINE XMVECTOR XMLoadShortN4
  1392. (
  1393. CONST XMSHORTN4* pSource
  1394. )
  1395. {
  1396. #if defined(_XM_NO_INTRINSICS_)
  1397. XMASSERT(pSource);
  1398. XMASSERT(pSource->x != -32768);
  1399. XMASSERT(pSource->y != -32768);
  1400. XMASSERT(pSource->z != -32768);
  1401. XMASSERT(pSource->w != -32768);
  1402. {
  1403. XMVECTOR vResult = {
  1404. (FLOAT)pSource->x * (1.0f/32767.0f),
  1405. (FLOAT)pSource->y * (1.0f/32767.0f),
  1406. (FLOAT)pSource->z * (1.0f/32767.0f),
  1407. (FLOAT)pSource->w * (1.0f/32767.0f)
  1408. };
  1409. return vResult;
  1410. }
  1411. #elif defined(_XM_SSE_INTRINSICS_)
  1412. XMASSERT(pSource);
  1413. XMASSERT(pSource->x != -32768);
  1414. XMASSERT(pSource->y != -32768);
  1415. XMASSERT(pSource->z != -32768);
  1416. XMASSERT(pSource->w != -32768);
  1417. // Splat the color in all four entries (x,z,y,w)
  1418. __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
  1419. // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
  1420. __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
  1421. // x and z are unsigned! Flip the bits to convert the order to signed
  1422. vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
  1423. // Convert to floating point numbers
  1424. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1425. // x and z - 0x8000 to complete the conversion
  1426. vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
  1427. // Convert -32767-32767 to -1.0f-1.0f
  1428. vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
  1429. // Very important! The entries are x,z,y,w, flip it to x,y,z,w
  1430. vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
  1431. return vTemp;
  1432. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1433. #endif // _XM_VMX128_INTRINSICS_
  1434. }
  1435. //------------------------------------------------------------------------------
  1436. XMFINLINE XMVECTOR XMLoadShort4
  1437. (
  1438. CONST XMSHORT4* pSource
  1439. )
  1440. {
  1441. #if defined(_XM_NO_INTRINSICS_)
  1442. XMVECTOR V;
  1443. XMASSERT(pSource);
  1444. XMASSERT(pSource->x != -32768);
  1445. XMASSERT(pSource->y != -32768);
  1446. XMASSERT(pSource->z != -32768);
  1447. XMASSERT(pSource->w != -32768);
  1448. V.vector4_f32[0] = (FLOAT)pSource->x;
  1449. V.vector4_f32[1] = (FLOAT)pSource->y;
  1450. V.vector4_f32[2] = (FLOAT)pSource->z;
  1451. V.vector4_f32[3] = (FLOAT)pSource->w;
  1452. return V;
  1453. #elif defined(_XM_SSE_INTRINSICS_)
  1454. XMASSERT(pSource);
  1455. XMASSERT(pSource->x != -32768);
  1456. XMASSERT(pSource->y != -32768);
  1457. XMASSERT(pSource->z != -32768);
  1458. XMASSERT(pSource->w != -32768);
  1459. // Splat the color in all four entries (x,z,y,w)
  1460. __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
  1461. // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
  1462. __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
  1463. // x and z are unsigned! Flip the bits to convert the order to signed
  1464. vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
  1465. // Convert to floating point numbers
  1466. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1467. // x and z - 0x8000 to complete the conversion
  1468. vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
  1469. // Fix y and w because they are 65536 too large
  1470. vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
  1471. // Very important! The entries are x,z,y,w, flip it to x,y,z,w
  1472. return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
  1473. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1474. #endif // _XM_VMX128_INTRINSICS_
  1475. }
  1476. //------------------------------------------------------------------------------
  1477. XMFINLINE XMVECTOR XMLoadUShortN4
  1478. (
  1479. CONST XMUSHORTN4* pSource
  1480. )
  1481. {
  1482. #if defined(_XM_NO_INTRINSICS_)
  1483. XMVECTOR V;
  1484. XMASSERT(pSource);
  1485. V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
  1486. V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
  1487. V.vector4_f32[2] = (FLOAT)pSource->z / 65535.0f;
  1488. V.vector4_f32[3] = (FLOAT)pSource->w / 65535.0f;
  1489. return V;
  1490. #elif defined(_XM_SSE_INTRINSICS_)
  1491. XMASSERT(pSource);
  1492. static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
  1493. static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
  1494. XMASSERT(pSource);
  1495. // Splat the color in all four entries (x,z,y,w)
  1496. __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1497. // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
  1498. __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
  1499. // y and w are signed! Flip the bits to convert the order to unsigned
  1500. vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
  1501. // Convert to floating point numbers
  1502. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1503. // y and w + 0x8000 to complete the conversion
  1504. vTemp = _mm_add_ps(vTemp,FixaddY16W16);
  1505. // Fix y and w because they are 65536 too large
  1506. vTemp = _mm_mul_ps(vTemp,FixupY16W16);
  1507. // Very important! The entries are x,z,y,w, flip it to x,y,z,w
  1508. return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
  1509. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1510. #endif // _XM_VMX128_INTRINSICS_
  1511. }
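// Usage sketch (illustrative, not part of the original source): the UNORM16 loader
// maps 0..65535 to 0.0f..1.0f, so the component values below (assumptions for the
// example) expand as shown.
//
// XMUSHORTN4 Packed;
// Packed.x = 0; Packed.y = 16384; Packed.z = 32768; Packed.w = 65535;
// XMVECTOR V = XMLoadUShortN4( &Packed );   // approximately {0.0f, 0.25f, 0.5f, 1.0f}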
  1512. //------------------------------------------------------------------------------
  1513. XMFINLINE XMVECTOR XMLoadUShort4
  1514. (
  1515. CONST XMUSHORT4* pSource
  1516. )
  1517. {
  1518. #if defined(_XM_NO_INTRINSICS_)
  1519. XMVECTOR V;
  1520. XMASSERT(pSource);
  1521. V.vector4_f32[0] = (FLOAT)pSource->x;
  1522. V.vector4_f32[1] = (FLOAT)pSource->y;
  1523. V.vector4_f32[2] = (FLOAT)pSource->z;
  1524. V.vector4_f32[3] = (FLOAT)pSource->w;
  1525. return V;
  1526. #elif defined(_XM_SSE_INTRINSICS_)
  1527. XMASSERT(pSource);
  1528. static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f};
  1529. XMASSERT(pSource);
  1530. // Splat the color in all four entries (x,z,y,w)
  1531. __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1532. // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
  1533. __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
  1534. // y and w are signed! Flip the bits to convert the order to unsigned
  1535. vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
  1536. // Convert to floating point numbers
  1537. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1538. // Fix y and w because they are 65536 too large
  1539. vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
  1540. // y and w + 0x8000 to complete the conversion
  1541. vTemp = _mm_add_ps(vTemp,FixaddY16W16);
  1542. // Very important! The entries are x,z,y,w, flip it to x,y,z,w
  1543. return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
  1544. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1545. #endif // _XM_VMX128_INTRINSICS_
  1546. }
  1547. //------------------------------------------------------------------------------
  1548. XMFINLINE XMVECTOR XMLoadXIcoN4
  1549. (
  1550. CONST XMXICON4* pSource
  1551. )
  1552. {
  1553. #if defined(_XM_NO_INTRINSICS_)
  1554. XMVECTOR V;
  1555. UINT Element;
  1556. static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
  1557. XMASSERT(pSource);
  1558. XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
  1559. XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
  1560. XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
  1561. Element = (UINT)(pSource->v & 0xFFFFF);
  1562. V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
  1563. Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
  1564. V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
  1565. Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
  1566. V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
  1567. V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
  1568. return V;
  1569. #elif defined(_XM_SSE_INTRINSICS_)
  1570. XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
  1571. XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
  1572. XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
  1573. static const XMVECTORF32 LoadXIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(15.0f*4096.0f*65536.0f)};
  1574. XMASSERT(pSource);
  1575. // Grab the 64 bit structure
  1576. __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1577. // By shifting down 8 bits, y and z are in separate 32 bit elements
  1578. __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
  1579. // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
  1580. XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
  1581. // Fix the entries to x,y,z,w
  1582. vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
  1583. // Mask x,y,z and w
  1584. vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
  1585. // x and z are unsigned! Flip the bits to convert the order to signed
  1586. vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
  1587. // Convert to floating point numbers
  1588. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1589. // Add the offsets that complete the conversion
  1590. vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
  1591. // Fix y and w because they are too large
  1592. vTemp = _mm_mul_ps(vTemp,LoadXIcoN4Mul);
  1593. return vTemp;
  1594. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1595. #endif // _XM_VMX128_INTRINSICS_
  1596. }
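// Layout sketch (illustrative, not part of the original source): XMXICON4 packs x,
// y and z as signed 20-bit fields in bits 0-19, 20-39 and 40-59 of the 64-bit value
// and w as an unsigned 4-bit field in bits 60-63; the normalized load divides x, y
// and z by 524287 and w by 15. Example with assumed field values:
//
// XMXICON4 Packed;
// Packed.v = 0x7FFFFull | (0xFull << 60);   // x = +524287, y = z = 0, w = 15
// XMVECTOR V = XMLoadXIcoN4( &Packed );     // approximately {1.0f, 0.0f, 0.0f, 1.0f}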
  1597. //------------------------------------------------------------------------------
  1598. XMFINLINE XMVECTOR XMLoadXIco4
  1599. (
  1600. CONST XMXICO4* pSource
  1601. )
  1602. {
  1603. #if defined(_XM_NO_INTRINSICS_)
  1604. XMVECTOR V;
  1605. UINT Element;
  1606. static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
  1607. XMASSERT(pSource);
  1608. XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
  1609. XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
  1610. XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
  1611. Element = (UINT)(pSource->v & 0xFFFFF);
  1612. V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
  1613. Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
  1614. V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
  1615. Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
  1616. V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
  1617. V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
  1618. return V;
  1619. #elif defined(_XM_SSE_INTRINSICS_)
  1620. XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
  1621. XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
  1622. XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
  1623. XMASSERT(pSource);
  1624. // Grab the 64 bit structure
  1625. __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1626. // By shifting down 8 bits, y and z are in separate 32 bit elements
  1627. __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
  1628. // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
  1629. XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
  1630. // Fix the entries to x,y,z,w
  1631. vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
  1632. // Mask x,y,z and w
  1633. vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
  1634. // x and z are unsigned! Flip the bits to convert the order to signed
  1635. vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
  1636. // Convert to floating point numbers
  1637. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1638. // Add the offsets that complete the conversion
  1639. vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
  1640. // Fix y and w because they are too large
  1641. vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
  1642. return vTemp;
  1643. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1644. #endif // _XM_VMX128_INTRINSICS_
  1645. }
  1646. //------------------------------------------------------------------------------
  1647. XMFINLINE XMVECTOR XMLoadUIcoN4
  1648. (
  1649. CONST XMUICON4* pSource
  1650. )
  1651. {
  1652. #if defined(_XM_NO_INTRINSICS_)
  1653. XMVECTOR V;
  1654. XMASSERT(pSource);
  1655. V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF) / 1048575.0f;
  1656. V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF) / 1048575.0f;
  1657. V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF) / 1048575.0f;
  1658. V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
  1659. return V;
  1660. #elif defined(_XM_SSE_INTRINSICS_)
  1661. static const XMVECTORF32 LoadUIcoN4Mul = {1.0f/1048575.0f,1.0f/(1048575.0f*4096.0f),1.0f/1048575.0f,1.0f/(15.0f*4096.0f*65536.0f)};
  1662. XMASSERT(pSource);
  1663. // Grab the 64 bit structure
  1664. __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1665. // By shifting down 8 bits, y and z are in separate 32 bit elements
  1666. __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
  1667. // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
  1668. XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
  1669. // Fix the entries to x,y,z,w
  1670. vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
  1671. // Mask x,y,z and w
  1672. vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1673. // y and w sit in the high bits of their lanes; flip their sign bits so the signed conversion below works
  1674. vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
  1675. // Convert to floating point numbers
  1676. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1677. // Add the offsets that undo the sign-bit flip on y and w
  1678. vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
  1679. // Fix y and w because they are too large
  1680. vTemp = _mm_mul_ps(vTemp,LoadUIcoN4Mul);
  1681. return vTemp;
  1682. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1683. #endif // _XM_VMX128_INTRINSICS_
  1684. }
  1685. //------------------------------------------------------------------------------
  1686. XMFINLINE XMVECTOR XMLoadUIco4
  1687. (
  1688. CONST XMUICO4* pSource
  1689. )
  1690. {
  1691. #if defined(_XM_NO_INTRINSICS_)
  1692. XMVECTOR V;
  1693. XMASSERT(pSource);
  1694. V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF);
  1695. V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF);
  1696. V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF);
  1697. V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
  1698. return V;
  1699. #elif defined(_XM_SSE_INTRINSICS_)
  1700. XMASSERT(pSource);
  1701. // Grab the 64 bit structure
  1702. __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1703. // By shifting down 8 bits, y and z are in separate 32 bit elements
  1704. __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
  1705. // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
  1706. XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
  1707. // Fix the entries to x,y,z,w
  1708. vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
  1709. // Mask x,y,z and w
  1710. vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1711. // y and w sit in the high bits of their lanes; flip their sign bits so the signed conversion below works
  1712. vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
  1713. // Convert to floating point numbers
  1714. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1715. // Add the offsets that undo the sign-bit flip on y and w
  1716. vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
  1717. // Fix y and w because they are too large
  1718. vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
  1719. return vTemp;
  1720. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1721. #endif // _XM_VMX128_INTRINSICS_
  1722. }
  1723. //------------------------------------------------------------------------------
  1724. XMFINLINE XMVECTOR XMLoadIcoN4
  1725. (
  1726. CONST XMICON4* pSource
  1727. )
  1728. {
  1729. #if defined(_XM_NO_INTRINSICS_)
  1730. XMVECTOR V;
  1731. UINT Element;
  1732. static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
  1733. static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
  1734. XMASSERT(pSource);
  1735. Element = (UINT)(pSource->v & 0xFFFFF);
  1736. V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
  1737. Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
  1738. V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
  1739. Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
  1740. V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
  1741. Element = (UINT)(pSource->v >> 60);
  1742. V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]) / 7.0f;
  1743. return V;
  1744. #elif defined(_XM_SSE_INTRINSICS_)
  1745. static const XMVECTORF32 LoadIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(7.0f*4096.0f*65536.0f)};
  1746. XMASSERT(pSource);
  1747. // Grab the 64 bit structure
  1748. __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1749. // By shifting down 8 bits, y and z are in separate 32 bit elements
  1750. __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
  1751. // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
  1752. XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
  1753. // Fix the entries to x,y,z,w
  1754. vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
  1755. // Mask x,y,z and w
  1756. vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
  1757. // x and z are unsigned! Flip the bits to convert the order to signed
  1758. vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
  1759. // Convert to floating point numbers
  1760. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1761. // Add the offsets that complete the conversion
  1762. vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
  1763. // Fix y and w because they are too large
  1764. vTemp = _mm_mul_ps(vTemp,LoadIcoN4Mul);
  1765. return vTemp;
  1766. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1767. #endif // _XM_VMX128_INTRINSICS_
  1768. }
  1769. //------------------------------------------------------------------------------
  1770. XMFINLINE XMVECTOR XMLoadIco4
  1771. (
  1772. CONST XMICO4* pSource
  1773. )
  1774. {
  1775. #if defined(_XM_NO_INTRINSICS_)
  1776. XMVECTOR V;
  1777. UINT Element;
  1778. static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
  1779. static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
  1780. XMASSERT(pSource);
  1781. Element = (UINT)(pSource->v & 0xFFFFF);
  1782. V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
  1783. Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
  1784. V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
  1785. Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
  1786. V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
  1787. Element = (UINT)(pSource->v >> 60);
  1788. V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]);
  1789. return V;
  1790. #elif defined(_XM_SSE_INTRINSICS_)
  1791. XMASSERT(pSource);
  1792. // Grab the 64 bit structure
  1793. __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1794. // By shifting down 8 bits, y and z are in separate 32 bit elements
  1795. __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
  1796. // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
  1797. XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
  1798. // Fix the entries to x,y,z,w
  1799. vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
  1800. // Mask x,y,z and w
  1801. vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
  1802. // x and z are unsigned! Flip the bits to convert the order to signed
  1803. vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
  1804. // Convert to floating point numbers
  1805. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1806. // Add the offsets that complete the conversion
  1807. vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
  1808. // Fix y and w because they are too large
  1809. vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
  1810. return vTemp;
  1811. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1812. #endif // _XM_VMX128_INTRINSICS_
  1813. }
  1814. //------------------------------------------------------------------------------
  1815. XMFINLINE XMVECTOR XMLoadXDecN4
  1816. (
  1817. CONST XMXDECN4* pSource
  1818. )
  1819. {
  1820. #if defined(_XM_NO_INTRINSICS_)
  1821. XMVECTOR V;
  1822. UINT Element;
  1823. static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
  1824. XMASSERT(pSource);
  1825. XMASSERT((pSource->v & 0x3FF) != 0x200);
  1826. XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
  1827. XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
  1828. Element = pSource->v & 0x3FF;
  1829. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
  1830. Element = (pSource->v >> 10) & 0x3FF;
  1831. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
  1832. Element = (pSource->v >> 20) & 0x3FF;
  1833. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
  1834. V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
  1835. return V;
  1836. #elif defined(_XM_SSE_INTRINSICS_)
  1837. XMASSERT(pSource);
  1838. // Splat the color in all four entries
  1839. __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1840. // Mask x&0x3FF, y&0xFFC00, z&0x3FF00000, w&0xC0000000
  1841. vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
  1842. // a is unsigned! Flip the bit to convert the order to signed
  1843. vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
  1844. // Convert to floating point numbers
  1845. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1846. // RGB + 0, A + 0x80000000.f to undo the signed order.
  1847. vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
1848. // Normalize x, y and z to -1.0f..1.0f and w to 0.0f..1.0f
  1849. return _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
  1850. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1851. #endif // _XM_VMX128_INTRINSICS_
  1852. }
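// Layout sketch (illustrative, not part of the original source): XMXDECN4 is a
// 10:10:10:2 format - x, y and z are signed 10-bit fields (bits 0-9, 10-19, 20-29)
// normalized by 511, and w is an unsigned 2-bit field (bits 30-31) normalized by 3.
//
// XMXDECN4 Packed;
// Packed.v = 511u | (3u << 30);             // x = +511, y = z = 0, w = 3
// XMVECTOR V = XMLoadXDecN4( &Packed );     // approximately {1.0f, 0.0f, 0.0f, 1.0f}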
  1853. //------------------------------------------------------------------------------
  1854. XMFINLINE XMVECTOR XMLoadXDec4
  1855. (
  1856. CONST XMXDEC4* pSource
  1857. )
  1858. {
  1859. #if defined(_XM_NO_INTRINSICS_)
  1860. XMVECTOR V;
  1861. UINT Element;
  1862. static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
  1863. XMASSERT(pSource);
  1864. XMASSERT((pSource->v & 0x3FF) != 0x200);
  1865. XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
  1866. XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
  1867. Element = pSource->v & 0x3FF;
  1868. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
  1869. Element = (pSource->v >> 10) & 0x3FF;
  1870. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
  1871. Element = (pSource->v >> 20) & 0x3FF;
  1872. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
  1873. V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
  1874. return V;
  1875. #elif defined(_XM_SSE_INTRINSICS_)
  1876. XMASSERT((pSource->v & 0x3FF) != 0x200);
  1877. XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
  1878. XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
  1879. static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
  1880. static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
  1881. XMASSERT(pSource);
  1882. // Splat the color in all four entries
  1883. XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1884. // Mask x&0x3FF, y&0xFFC00, z&0x3FF00000, w&0xC0000000
  1885. vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
  1886. // a is unsigned! Flip the bit to convert the order to signed
  1887. vTemp = _mm_xor_ps(vTemp,XDec4Xor);
  1888. // Convert to floating point numbers
  1889. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1890. // RGB + 0, A + 0x80000000.f to undo the signed order.
  1891. vTemp = _mm_add_ps(vTemp,XDec4Add);
1892. // Scale y, z and w back down to their integer values
  1893. vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
  1894. return vTemp;
  1895. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1896. #endif // _XM_VMX128_INTRINSICS_
  1897. }
  1898. //------------------------------------------------------------------------------
  1899. XMFINLINE XMVECTOR XMLoadUDecN4
  1900. (
  1901. CONST XMUDECN4* pSource
  1902. )
  1903. {
  1904. #if defined(_XM_NO_INTRINSICS_)
  1905. XMVECTOR V;
  1906. UINT Element;
  1907. XMASSERT(pSource);
  1908. Element = pSource->v & 0x3FF;
  1909. V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
  1910. Element = (pSource->v >> 10) & 0x3FF;
  1911. V.vector4_f32[1] = (FLOAT)Element / 1023.0f;
  1912. Element = (pSource->v >> 20) & 0x3FF;
  1913. V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
  1914. V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
  1915. return V;
  1916. #elif defined(_XM_SSE_INTRINSICS_)
  1917. XMASSERT(pSource);
  1918. static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
  1919. // Splat the color in all four entries
  1920. XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1921. // Mask x&0x3FF, y&0xFFC00, z&0x3FF00000, w&0xC0000000
  1922. vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
  1923. // a is unsigned! Flip the bit to convert the order to signed
  1924. vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
  1925. // Convert to floating point numbers
  1926. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1927. // RGB + 0, A + 0x80000000.f to undo the signed order.
  1928. vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
1929. // Normalize the components to 0.0f..1.0f
  1930. vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
  1931. return vTemp;
  1932. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1933. #endif // _XM_VMX128_INTRINSICS_
  1934. }
  1935. //------------------------------------------------------------------------------
  1936. XMFINLINE XMVECTOR XMLoadUDec4
  1937. (
  1938. CONST XMUDEC4* pSource
  1939. )
  1940. {
  1941. #if defined(_XM_NO_INTRINSICS_)
  1942. XMVECTOR V;
  1943. UINT Element;
  1944. XMASSERT(pSource);
  1945. Element = pSource->v & 0x3FF;
  1946. V.vector4_f32[0] = (FLOAT)Element;
  1947. Element = (pSource->v >> 10) & 0x3FF;
  1948. V.vector4_f32[1] = (FLOAT)Element;
  1949. Element = (pSource->v >> 20) & 0x3FF;
  1950. V.vector4_f32[2] = (FLOAT)Element;
  1951. V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
  1952. return V;
  1953. #elif defined(_XM_SSE_INTRINSICS_)
  1954. XMASSERT(pSource);
  1955. // Splat the color in all four entries
  1956. XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1957. // Mask x&0x3FF, y&0xFFC00, z&0x3FF00000, w&0xC0000000
  1958. vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
  1959. // a is unsigned! Flip the bit to convert the order to signed
  1960. vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
  1961. // Convert to floating point numbers
  1962. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  1963. // RGB + 0, A + 0x80000000.f to undo the signed order.
  1964. vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
1965. // Scale y, z and w back down to their integer values
  1966. vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
  1967. return vTemp;
  1968. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  1969. #endif // _XM_VMX128_INTRINSICS_
  1970. }
  1971. //------------------------------------------------------------------------------
  1972. XMFINLINE XMVECTOR XMLoadDecN4
  1973. (
  1974. CONST XMDECN4* pSource
  1975. )
  1976. {
  1977. #if defined(_XM_NO_INTRINSICS_)
  1978. XMVECTOR V;
  1979. UINT Element;
  1980. static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
  1981. static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
  1982. XMASSERT(pSource);
  1983. XMASSERT((pSource->v & 0x3FF) != 0x200);
  1984. XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
  1985. XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
  1986. XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
  1987. Element = pSource->v & 0x3FF;
  1988. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
  1989. Element = (pSource->v >> 10) & 0x3FF;
  1990. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
  1991. Element = (pSource->v >> 20) & 0x3FF;
  1992. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
  1993. Element = pSource->v >> 30;
  1994. V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
  1995. return V;
  1996. #elif defined(_XM_SSE_INTRINSICS_)
  1997. XMASSERT(pSource);
  1998. XMASSERT((pSource->v & 0x3FF) != 0x200);
  1999. XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
  2000. XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
  2001. XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
  2002. static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
  2003. // Splat the color in all four entries
  2004. XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2005. // Mask x&0x3FF, y&0xFFC00, z&0x3FF00000, w&0xC0000000
  2006. vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
  2007. // a is unsigned! Flip the bit to convert the order to signed
  2008. vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
  2009. // Convert to floating point numbers
  2010. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  2011. // RGB + 0, A + 0x80000000.f to undo the signed order.
  2012. vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
2013. // Normalize the components to -1.0f..1.0f
  2014. vTemp = _mm_mul_ps(vTemp,DecN4Mul);
  2015. return vTemp;
  2016. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2017. #endif // _XM_VMX128_INTRINSICS_
  2018. }
  2019. //------------------------------------------------------------------------------
  2020. XMFINLINE XMVECTOR XMLoadDec4
  2021. (
  2022. CONST XMDEC4* pSource
  2023. )
  2024. {
  2025. #if defined(_XM_NO_INTRINSICS_)
  2026. XMVECTOR V;
  2027. UINT Element;
  2028. static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
  2029. static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
  2030. XMASSERT(pSource);
  2031. XMASSERT((pSource->v & 0x3FF) != 0x200);
  2032. XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
  2033. XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
  2034. XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
  2035. Element = pSource->v & 0x3FF;
  2036. V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
  2037. Element = (pSource->v >> 10) & 0x3FF;
  2038. V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
  2039. Element = (pSource->v >> 20) & 0x3FF;
  2040. V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
  2041. Element = pSource->v >> 30;
  2042. V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
  2043. return V;
  2044. #elif defined(_XM_SSE_INTRINSICS_)
  2045. XMASSERT((pSource->v & 0x3FF) != 0x200);
  2046. XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
  2047. XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
  2048. XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
  2049. XMASSERT(pSource);
  2050. // Splat the color in all four entries
  2051. XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2052. // Mask x&0x3FF, y&0xFFC00, z&0x3FF00000, w&0xC0000000
  2053. vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
  2054. // a is unsigned! Flip the bit to convert the order to signed
  2055. vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
  2056. // Convert to floating point numbers
  2057. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  2058. // RGB + 0, A + 0x80000000.f to undo the signed order.
  2059. vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
2060. // Scale y, z and w back down to their integer values
  2061. vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
  2062. return vTemp;
  2063. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2064. #endif // _XM_VMX128_INTRINSICS_
  2065. }
  2066. //------------------------------------------------------------------------------
  2067. XMFINLINE XMVECTOR XMLoadUByteN4
  2068. (
  2069. CONST XMUBYTEN4* pSource
  2070. )
  2071. {
  2072. #if defined(_XM_NO_INTRINSICS_)
  2073. XMVECTOR V;
  2074. XMASSERT(pSource);
  2075. V.vector4_f32[0] = (FLOAT)pSource->x / 255.0f;
  2076. V.vector4_f32[1] = (FLOAT)pSource->y / 255.0f;
  2077. V.vector4_f32[2] = (FLOAT)pSource->z / 255.0f;
  2078. V.vector4_f32[3] = (FLOAT)pSource->w / 255.0f;
  2079. return V;
  2080. #elif defined(_XM_SSE_INTRINSICS_)
  2081. static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
  2082. XMASSERT(pSource);
  2083. // Splat the color in all four entries (x,z,y,w)
  2084. XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2085. // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
  2086. vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
  2087. // w is signed! Flip the bits to convert the order to unsigned
  2088. vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
  2089. // Convert to floating point numbers
  2090. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2091. // Undo the sign-bit flip on w (adds 0x80000000 as a float)
  2092. vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
  2093. // Fix y, z and w because they are too large
  2094. vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
  2095. return vTemp;
  2096. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2097. #endif // _XM_VMX128_INTRINSICS_
  2098. }
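// Usage sketch (illustrative, not part of the original source): UNORM8 components
// map 0..255 to 0.0f..1.0f, which makes this loader convenient for byte-packed data
// such as vertex colors stored as XMUBYTEN4. The example values are assumptions.
//
// XMUBYTEN4 Packed;
// Packed.x = 255; Packed.y = 128; Packed.z = 0; Packed.w = 255;
// XMVECTOR V = XMLoadUByteN4( &Packed );    // approximately {1.0f, 0.502f, 0.0f, 1.0f}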
  2099. //------------------------------------------------------------------------------
  2100. XMFINLINE XMVECTOR XMLoadUByte4
  2101. (
  2102. CONST XMUBYTE4* pSource
  2103. )
  2104. {
  2105. #if defined(_XM_NO_INTRINSICS_)
  2106. XMVECTOR V;
  2107. XMASSERT(pSource);
  2108. V.vector4_f32[0] = (FLOAT)pSource->x;
  2109. V.vector4_f32[1] = (FLOAT)pSource->y;
  2110. V.vector4_f32[2] = (FLOAT)pSource->z;
  2111. V.vector4_f32[3] = (FLOAT)pSource->w;
  2112. return V;
  2113. #elif defined(_XM_SSE_INTRINSICS_)
  2114. static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
  2115. XMASSERT(pSource);
  2116. // Splat the color in all four entries (x,z,y,w)
  2117. XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2118. // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
  2119. vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
  2120. // w is signed! Flip the bits to convert the order to unsigned
  2121. vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
  2122. // Convert to floating point numbers
  2123. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2124. // Undo the sign-bit flip on w (adds 0x80000000 as a float)
  2125. vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
  2126. // Fix y, z and w because they are too large
  2127. vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
  2128. return vTemp;
  2129. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2130. #endif // _XM_VMX128_INTRINSICS_
  2131. }
  2132. //------------------------------------------------------------------------------
  2133. XMFINLINE XMVECTOR XMLoadByteN4
  2134. (
  2135. CONST XMBYTEN4* pSource
  2136. )
  2137. {
  2138. #if defined(_XM_NO_INTRINSICS_)
  2139. XMVECTOR V;
  2140. XMASSERT(pSource);
  2141. XMASSERT(pSource->x != -128);
  2142. XMASSERT(pSource->y != -128);
  2143. XMASSERT(pSource->z != -128);
  2144. XMASSERT(pSource->w != -128);
  2145. V.vector4_f32[0] = (FLOAT)pSource->x / 127.0f;
  2146. V.vector4_f32[1] = (FLOAT)pSource->y / 127.0f;
  2147. V.vector4_f32[2] = (FLOAT)pSource->z / 127.0f;
  2148. V.vector4_f32[3] = (FLOAT)pSource->w / 127.0f;
  2149. return V;
  2150. #elif defined(_XM_SSE_INTRINSICS_)
  2151. static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
  2152. XMASSERT(pSource);
  2153. XMASSERT(pSource->x != -128);
  2154. XMASSERT(pSource->y != -128);
  2155. XMASSERT(pSource->z != -128);
  2156. XMASSERT(pSource->w != -128);
  2157. // Splat the color in all four entries (x,z,y,w)
  2158. XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2159. // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
  2160. vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
  2161. // x,y and z are unsigned! Flip the bits to convert the order to signed
  2162. vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
  2163. // Convert to floating point numbers
  2164. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  2165. // x, y and z - 0x80 to complete the conversion
  2166. vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
  2167. // Fix y, z and w because they are too large
  2168. vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
  2169. return vTemp;
  2170. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2171. #endif // _XM_VMX128_INTRINSICS_
  2172. }
  2173. //------------------------------------------------------------------------------
  2174. XMFINLINE XMVECTOR XMLoadByte4
  2175. (
  2176. CONST XMBYTE4* pSource
  2177. )
  2178. {
  2179. #if defined(_XM_NO_INTRINSICS_)
  2180. XMVECTOR V;
  2181. XMASSERT(pSource);
  2182. XMASSERT(pSource->x != -128);
  2183. XMASSERT(pSource->y != -128);
  2184. XMASSERT(pSource->z != -128);
  2185. XMASSERT(pSource->w != -128);
  2186. V.vector4_f32[0] = (FLOAT)pSource->x;
  2187. V.vector4_f32[1] = (FLOAT)pSource->y;
  2188. V.vector4_f32[2] = (FLOAT)pSource->z;
  2189. V.vector4_f32[3] = (FLOAT)pSource->w;
  2190. return V;
  2191. #elif defined(_XM_SSE_INTRINSICS_)
  2192. static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
  2193. XMASSERT(pSource);
  2194. XMASSERT(pSource->x != -128);
  2195. XMASSERT(pSource->y != -128);
  2196. XMASSERT(pSource->z != -128);
  2197. XMASSERT(pSource->w != -128);
  2198. // Splat the color in all four entries (x,z,y,w)
  2199. XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2200. // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
  2201. vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
  2202. // x,y and z are unsigned! Flip the bits to convert the order to signed
  2203. vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
  2204. // Convert to floating point numbers
  2205. vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
  2206. // x, y and z - 0x80 to complete the conversion
  2207. vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
  2208. // Fix y, z and w because they are too large
  2209. vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
  2210. return vTemp;
  2211. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2212. #endif // _XM_VMX128_INTRINSICS_
  2213. }
  2214. //------------------------------------------------------------------------------
  2215. XMFINLINE XMVECTOR XMLoadUNibble4
  2216. (
  2217. CONST XMUNIBBLE4* pSource
  2218. )
  2219. {
  2220. #if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  2221. static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
  2222. static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
  2223. XMASSERT(pSource);
  2224. // Get the 32 bit value and splat it
  2225. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2226. // Mask off x, y, z and w
  2227. vResult = _mm_and_ps(vResult,UNibble4And);
  2228. // Convert to float
  2229. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
2230. // Scale y, z and w back down to their integer values
  2231. vResult = _mm_mul_ps(vResult,UNibble4Mul);
  2232. return vResult;
  2233. #else
  2234. XMVECTOR V;
  2235. UINT Element;
  2236. XMASSERT(pSource);
  2237. Element = pSource->v & 0xF;
  2238. V.vector4_f32[0] = (FLOAT)Element;
  2239. Element = (pSource->v >> 4) & 0xF;
  2240. V.vector4_f32[1] = (FLOAT)Element;
  2241. Element = (pSource->v >> 8) & 0xF;
  2242. V.vector4_f32[2] = (FLOAT)Element;
  2243. Element = (pSource->v >> 12) & 0xF;
  2244. V.vector4_f32[3] = (FLOAT)Element;
  2245. return V;
2246. #endif // !_XM_SSE_INTRINSICS_
  2247. }
  2248. //------------------------------------------------------------------------------
  2249. XMFINLINE XMVECTOR XMLoadU555
  2250. (
  2251. CONST XMU555* pSource
  2252. )
  2253. {
  2254. #if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  2255. static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
  2256. static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
  2257. XMASSERT(pSource);
  2258. // Get the 32 bit value and splat it
  2259. XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2260. // Mask off x, y, z and w
  2261. vResult = _mm_and_ps(vResult,U555And);
  2262. // Convert to float
  2263. vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
2264. // Scale y, z and w back down to their integer values
  2265. vResult = _mm_mul_ps(vResult,U555Mul);
  2266. return vResult;
  2267. #else
  2268. XMVECTOR V;
  2269. UINT Element;
  2270. XMASSERT(pSource);
  2271. Element = pSource->v & 0x1F;
  2272. V.vector4_f32[0] = (FLOAT)Element;
  2273. Element = (pSource->v >> 5) & 0x1F;
  2274. V.vector4_f32[1] = (FLOAT)Element;
  2275. Element = (pSource->v >> 10) & 0x1F;
  2276. V.vector4_f32[2] = (FLOAT)Element;
  2277. Element = (pSource->v >> 15) & 0x1;
  2278. V.vector4_f32[3] = (FLOAT)Element;
  2279. return V;
2280. #endif // !_XM_SSE_INTRINSICS_
  2281. }
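// Layout sketch (illustrative, not part of the original source): XMU555 packs x, y
// and z as unsigned 5-bit fields (bits 0-4, 5-9, 10-14) and w as a single bit
// (bit 15); the components load as unnormalized integer values.
//
// XMU555 Packed;
// Packed.v = 0x821F;                        // x = 31, y = 16, z = 0, w = 1
// XMVECTOR V = XMLoadU555( &Packed );       // {31.0f, 16.0f, 0.0f, 1.0f}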
  2282. //------------------------------------------------------------------------------
  2283. XMFINLINE XMVECTOR XMLoadColor
  2284. (
  2285. CONST XMCOLOR* pSource
  2286. )
  2287. {
  2288. #if defined(_XM_NO_INTRINSICS_)
  2289. XMASSERT(pSource);
  2290. {
  2291. // INT -> Float conversions are done in one instruction.
  2292. // UINT -> Float calls a runtime function. Keep in INT
  2293. INT iColor = (INT)(pSource->c);
  2294. XMVECTOR vColor = {
  2295. (FLOAT)((iColor >> 16) & 0xFF) * (1.0f/255.0f),
  2296. (FLOAT)((iColor >> 8) & 0xFF) * (1.0f/255.0f),
  2297. (FLOAT)(iColor & 0xFF) * (1.0f/255.0f),
  2298. (FLOAT)((iColor >> 24) & 0xFF) * (1.0f/255.0f)
  2299. };
  2300. return vColor;
  2301. }
  2302. #elif defined(_XM_SSE_INTRINSICS_)
  2303. XMASSERT(pSource);
  2304. // Splat the color in all four entries
  2305. __m128i vInt = _mm_set1_epi32(pSource->c);
  2306. // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
  2307. vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8);
  2308. // a is unsigned! Flip the bit to convert the order to signed
  2309. vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8);
  2310. // Convert to floating point numbers
  2311. XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
  2312. // RGB + 0, A + 0x80000000.f to undo the signed order.
  2313. vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8);
  2314. // Convert 0-255 to 0.0f-1.0f
  2315. return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8);
  2316. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2317. #endif // _XM_VMX128_INTRINSICS_
  2318. }
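// Usage sketch (illustrative, not part of the original source): XMCOLOR holds an
// A8R8G8B8 value and the loader returns {r, g, b, a} in 0.0f..1.0f. Opaque red:
//
// XMCOLOR Color;
// Color.c = 0xFFFF0000;                     // A = 0xFF, R = 0xFF, G = 0x00, B = 0x00
// XMVECTOR V = XMLoadColor( &Color );       // {1.0f, 0.0f, 0.0f, 1.0f}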
  2319. //------------------------------------------------------------------------------
  2320. XMFINLINE XMMATRIX XMLoadFloat3x3
  2321. (
  2322. CONST XMFLOAT3X3* pSource
  2323. )
  2324. {
  2325. #if defined(_XM_NO_INTRINSICS_)
  2326. XMMATRIX M;
  2327. XMASSERT(pSource);
  2328. M.r[0].vector4_f32[0] = pSource->m[0][0];
  2329. M.r[0].vector4_f32[1] = pSource->m[0][1];
  2330. M.r[0].vector4_f32[2] = pSource->m[0][2];
  2331. M.r[0].vector4_f32[3] = 0.0f;
  2332. M.r[1].vector4_f32[0] = pSource->m[1][0];
  2333. M.r[1].vector4_f32[1] = pSource->m[1][1];
  2334. M.r[1].vector4_f32[2] = pSource->m[1][2];
  2335. M.r[1].vector4_f32[3] = 0.0f;
  2336. M.r[2].vector4_f32[0] = pSource->m[2][0];
  2337. M.r[2].vector4_f32[1] = pSource->m[2][1];
  2338. M.r[2].vector4_f32[2] = pSource->m[2][2];
  2339. M.r[2].vector4_f32[3] = 0.0f;
  2340. M.r[3].vector4_f32[0] = 0.0f;
  2341. M.r[3].vector4_f32[1] = 0.0f;
  2342. M.r[3].vector4_f32[2] = 0.0f;
  2343. M.r[3].vector4_f32[3] = 1.0f;
  2344. return M;
  2345. #elif defined(_XM_SSE_INTRINSICS_)
  2346. XMMATRIX M;
  2347. XMVECTOR V1, V2, V3, Z, T1, T2, T3, T4, T5;
  2348. Z = _mm_setzero_ps();
  2349. XMASSERT(pSource);
  2350. V1 = _mm_loadu_ps( &pSource->m[0][0] );
  2351. V2 = _mm_loadu_ps( &pSource->m[1][1] );
  2352. V3 = _mm_load_ss( &pSource->m[2][2] );
  2353. T1 = _mm_unpackhi_ps( V1, Z );
  2354. T2 = _mm_unpacklo_ps( V2, Z );
  2355. T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
  2356. T4 = _mm_movehl_ps( T2, T3 );
  2357. T5 = _mm_movehl_ps( Z, T1 );
  2358. M.r[0] = _mm_movelh_ps( V1, T1 );
  2359. M.r[1] = _mm_add_ps( T4, T5 );
  2360. M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
  2361. M.r[3] = g_XMIdentityR3;
  2362. return M;
  2363. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2364. #endif // _XM_VMX128_INTRINSICS_
  2365. }
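// Usage sketch (illustrative, not part of the original source): the 3x3 load widens
// to a full XMMATRIX, zero-filling the fourth column of the first three rows and
// appending an identity fourth row, so a rotation stored as XMFLOAT3X3 can be used
// directly with the 4x4 transform functions.
//
// XMFLOAT3X3 Rotation;                      // filled in elsewhere
// XMMATRIX R = XMLoadFloat3x3( &Rotation );
// XMVECTOR P = XMVector3Transform( XMVectorSet( 1.0f, 0.0f, 0.0f, 0.0f ), R );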
  2366. //------------------------------------------------------------------------------
  2367. XMFINLINE XMMATRIX XMLoadFloat4x3
  2368. (
  2369. CONST XMFLOAT4X3* pSource
  2370. )
  2371. {
  2372. #if defined(_XM_NO_INTRINSICS_)
  2373. XMMATRIX M;
  2374. XMASSERT(pSource);
  2375. ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
  2376. ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
  2377. ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
  2378. M.r[0].vector4_f32[3] = 0.0f;
  2379. ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
  2380. ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
  2381. ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
  2382. M.r[1].vector4_f32[3] = 0.0f;
  2383. ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
  2384. ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
  2385. ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
  2386. M.r[2].vector4_f32[3] = 0.0f;
  2387. ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
  2388. ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
  2389. ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
  2390. M.r[3].vector4_f32[3] = 1.0f;
  2391. return M;
  2392. #elif defined(_XM_SSE_INTRINSICS_)
  2393. XMASSERT(pSource);
  2394. // Use unaligned load instructions to
  2395. // load the 12 floats
  2396. // vTemp1 = x1,y1,z1,x2
  2397. XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
  2398. // vTemp2 = y2,z2,x3,y3
  2399. XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
  2400. // vTemp4 = z3,x4,y4,z4
  2401. XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
  2402. // vTemp3 = x3,y3,z3,z3
  2403. XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
  2404. // vTemp2 = y2,z2,x2,x2
  2405. vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
  2406. // vTemp2 = x2,y2,z2,z2
  2407. vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
  2408. // vTemp1 = x1,y1,z1,0
  2409. vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
  2410. // vTemp2 = x2,y2,z2,0
  2411. vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
  2412. // vTemp3 = x3,y3,z3,0
  2413. vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
  2414. // vTemp4i = x4,y4,z4,0
  2415. __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
  2416. // vTemp4i = x4,y4,z4,1.0f
  2417. vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
  2418. XMMATRIX M(vTemp1,
  2419. vTemp2,
  2420. vTemp3,
  2421. reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
  2422. return M;
  2423. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2424. #endif // _XM_VMX128_INTRINSICS_
  2425. }
  2426. //------------------------------------------------------------------------------
  2427. XMFINLINE XMMATRIX XMLoadFloat4x3A
  2428. (
  2429. CONST XMFLOAT4X3A* pSource
  2430. )
  2431. {
  2432. #if defined(_XM_NO_INTRINSICS_)
  2433. XMMATRIX M;
  2434. XMASSERT(pSource);
  2435. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  2436. M.r[0].vector4_f32[0] = pSource->m[0][0];
  2437. M.r[0].vector4_f32[1] = pSource->m[0][1];
  2438. M.r[0].vector4_f32[2] = pSource->m[0][2];
  2439. M.r[0].vector4_f32[3] = 0.0f;
  2440. M.r[1].vector4_f32[0] = pSource->m[1][0];
  2441. M.r[1].vector4_f32[1] = pSource->m[1][1];
  2442. M.r[1].vector4_f32[2] = pSource->m[1][2];
  2443. M.r[1].vector4_f32[3] = 0.0f;
  2444. M.r[2].vector4_f32[0] = pSource->m[2][0];
  2445. M.r[2].vector4_f32[1] = pSource->m[2][1];
  2446. M.r[2].vector4_f32[2] = pSource->m[2][2];
  2447. M.r[2].vector4_f32[3] = 0.0f;
  2448. M.r[3].vector4_f32[0] = pSource->m[3][0];
  2449. M.r[3].vector4_f32[1] = pSource->m[3][1];
  2450. M.r[3].vector4_f32[2] = pSource->m[3][2];
  2451. M.r[3].vector4_f32[3] = 1.0f;
  2452. return M;
  2453. #elif defined(_XM_SSE_INTRINSICS_)
  2454. XMASSERT(pSource);
  2455. // Use aligned load instructions to
  2456. // load the 12 floats
  2457. // vTemp1 = x1,y1,z1,x2
  2458. XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
  2459. // vTemp2 = y2,z2,x3,y3
  2460. XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
  2461. // vTemp4 = z3,x4,y4,z4
  2462. XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
  2463. // vTemp3 = x3,y3,z3,z3
  2464. XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
  2465. // vTemp2 = y2,z2,x2,x2
  2466. vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
  2467. // vTemp2 = x2,y2,z2,z2
  2468. vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
  2469. // vTemp1 = x1,y1,z1,0
  2470. vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
  2471. // vTemp2 = x2,y2,z2,0
  2472. vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
  2473. // vTemp3 = x3,y3,z3,0
  2474. vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
  2475. // vTemp4i = x4,y4,z4,0
  2476. __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
  2477. // vTemp4i = x4,y4,z4,1.0f
  2478. vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
  2479. XMMATRIX M(vTemp1,
  2480. vTemp2,
  2481. vTemp3,
  2482. reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
  2483. return M;
  2484. #else // _XM_VMX128_INTRINSICS_
  2485. #endif // _XM_VMX128_INTRINSICS_
  2486. }
  2487. //------------------------------------------------------------------------------
  2488. XMFINLINE XMMATRIX XMLoadFloat4x4
  2489. (
  2490. CONST XMFLOAT4X4* pSource
  2491. )
  2492. {
  2493. #if defined(_XM_NO_INTRINSICS_)
  2494. XMMATRIX M;
  2495. XMASSERT(pSource);
  2496. ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
  2497. ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
  2498. ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
  2499. ((UINT *)(&M.r[0].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[0][3]))[0];
  2500. ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
  2501. ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
  2502. ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
  2503. ((UINT *)(&M.r[1].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[1][3]))[0];
  2504. ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
  2505. ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
  2506. ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
  2507. ((UINT *)(&M.r[2].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[2][3]))[0];
  2508. ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
  2509. ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
  2510. ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
  2511. ((UINT *)(&M.r[3].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[3][3]))[0];
  2512. return M;
  2513. #elif defined(_XM_SSE_INTRINSICS_)
  2514. XMASSERT(pSource);
  2515. XMMATRIX M;
  2516. M.r[0] = _mm_loadu_ps( &pSource->_11 );
  2517. M.r[1] = _mm_loadu_ps( &pSource->_21 );
  2518. M.r[2] = _mm_loadu_ps( &pSource->_31 );
  2519. M.r[3] = _mm_loadu_ps( &pSource->_41 );
  2520. return M;
  2521. #elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
  2522. #endif // _XM_VMX128_INTRINSICS_
  2523. }
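// Usage sketch (illustrative, not part of the original source): this overload
// accepts arbitrarily aligned data and therefore uses unaligned loads on the SSE
// path; when the matrix is known to be 16-byte aligned, XMLoadFloat4x4A below can
// use the faster aligned loads instead.
//
// XMFLOAT4X4 World;                         // e.g. a member of an unaligned struct
// XMMATRIX M = XMLoadFloat4x4( &World );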
  2524. //------------------------------------------------------------------------------
  2525. XMFINLINE XMMATRIX XMLoadFloat4x4A
  2526. (
  2527. CONST XMFLOAT4X4A* pSource
  2528. )
  2529. {
  2530. #if defined(_XM_NO_INTRINSICS_)
  2531. XMMATRIX M;
  2532. XMASSERT(pSource);
  2533. XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
  2534. M.r[0].vector4_f32[0] = pSource->m[0][0];
  2535. M.r[0].vector4_f32[1] = pSource->m[0][1];
  2536. M.r[0].vector4_f32[2] = pSource->m[0][2];
  2537. M.r[0].vector4_f32[3] = pSource->m[0][3];
  2538. M.r[1].vector4_f32[0] = pSource->m[1][0];
  2539. M.r[1].vector4_f32[1] = pSource->m[1][1];
  2540. M.r[1].vector4_f32[2] = pSource->m[1][2];
  2541. M.r[1].vector4_f32[3] = pSource->m[1][3];
  2542. M.r[2].vector4_f32[0] = pSource->m[2][0];
  2543. M.r[2].vector4_f32[1] = pSource->m[2][1];
  2544. M.r[2].vector4_f32[2] = pSource->m[2][2];
  2545. M.r[2].vector4_f32[3] = pSource->m[2][3];
  2546. M.r[3].vector4_f32[0] = pSource->m[3][0];
  2547. M.r[3].vector4_f32[1] = pSource->m[3][1];
  2548. M.r[3].vector4_f32[2] = pSource->m[3][2];
  2549. M.r[3].vector4_f32[3] = pSource->m[3][3];
  2550. return M;
  2551. #elif defined(_XM_SSE_INTRINSICS_)
  2552. XMMATRIX M;
  2553. XMASSERT(pSource);
  2554. M.r[0] = _mm_load_ps( &pSource->_11 );
  2555. M.r[1] = _mm_load_ps( &pSource->_21 );
  2556. M.r[2] = _mm_load_ps( &pSource->_31 );
  2557. M.r[3] = _mm_load_ps( &pSource->_41 );
  2558. return M;
  2559. #else // _XM_VMX128_INTRINSICS_
  2560. #endif // _XM_VMX128_INTRINSICS_
  2561. }
  2562. /****************************************************************************
  2563. *
  2564. * Vector and matrix store operations
  2565. *
  2566. ****************************************************************************/
  2567. XMFINLINE VOID XMStoreInt
  2568. (
  2569. UINT* pDestination,
  2570. FXMVECTOR V
  2571. )
  2572. {
  2573. #if defined(_XM_NO_INTRINSICS_)
  2574. XMASSERT(pDestination);
  2575. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2576. *pDestination = XMVectorGetIntX( V );
  2577. #elif defined(_XM_SSE_INTRINSICS_)
  2578. XMASSERT(pDestination);
  2579. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2580. _mm_store_ss( (float*)pDestination, V );
  2581. #else // _XM_VMX128_INTRINSICS_
  2582. #endif // _XM_VMX128_INTRINSICS_
  2583. }
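// Usage sketch (illustrative, not part of the original source): XMStoreInt writes
// only the x component of the vector, reinterpreted as a 32-bit unsigned integer.
//
// UINT Bits;
// XMStoreInt( &Bits, XMVectorSetInt( 0xDEADBEEF, 0, 0, 0 ) );   // Bits == 0xDEADBEEF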
  2584. //------------------------------------------------------------------------------
  2585. XMFINLINE VOID XMStoreFloat
  2586. (
  2587. FLOAT* pDestination,
  2588. FXMVECTOR V
  2589. )
  2590. {
  2591. #if defined(_XM_NO_INTRINSICS_)
  2592. XMASSERT(pDestination);
  2593. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2594. *pDestination = XMVectorGetX( V );
  2595. #elif defined(_XM_SSE_INTRINSICS_)
  2596. XMASSERT(pDestination);
  2597. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2598. _mm_store_ss( pDestination, V );
  2599. #else // _XM_VMX128_INTRINSICS_
  2600. #endif // _XM_VMX128_INTRINSICS_
  2601. }
  2602. //------------------------------------------------------------------------------
  2603. XMFINLINE VOID XMStoreInt2
  2604. (
  2605. UINT* pDestination,
  2606. FXMVECTOR V
  2607. )
  2608. {
  2609. #if defined(_XM_NO_INTRINSICS_)
  2610. XMASSERT(pDestination);
  2611. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2612. pDestination[0] = V.vector4_u32[0];
  2613. pDestination[1] = V.vector4_u32[1];
  2614. #elif defined(_XM_SSE_INTRINSICS_)
  2615. XMASSERT(pDestination);
  2616. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2617. XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
  2618. _mm_store_ss( (float*)&pDestination[0], V );
  2619. _mm_store_ss( (float*)&pDestination[1], T );
  2620. #else // _XM_VMX128_INTRINSICS_
  2621. #endif // _XM_VMX128_INTRINSICS_
  2622. }
  2623. //------------------------------------------------------------------------------
  2624. XMFINLINE VOID XMStoreInt2A
  2625. (
  2626. UINT* pDestination,
  2627. FXMVECTOR V
  2628. )
  2629. {
  2630. #if defined(_XM_NO_INTRINSICS_)
  2631. XMASSERT(pDestination);
  2632. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2633. pDestination[0] = V.vector4_u32[0];
  2634. pDestination[1] = V.vector4_u32[1];
  2635. #elif defined(_XM_SSE_INTRINSICS_)
  2636. XMASSERT(pDestination);
  2637. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2638. _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
  2639. #else // _XM_VMX128_INTRINSICS_
  2640. #endif // _XM_VMX128_INTRINSICS_
  2641. }
  2642. //------------------------------------------------------------------------------
  2643. XMFINLINE VOID XMStoreFloat2
  2644. (
  2645. XMFLOAT2* pDestination,
  2646. FXMVECTOR V
  2647. )
  2648. {
  2649. #if defined(_XM_NO_INTRINSICS_)
  2650. XMASSERT(pDestination);
  2651. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2652. pDestination->x = V.vector4_f32[0];
  2653. pDestination->y = V.vector4_f32[1];
  2654. #elif defined(_XM_SSE_INTRINSICS_)
  2655. XMASSERT(pDestination);
  2656. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2657. XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
  2658. _mm_store_ss( &pDestination->x, V );
  2659. _mm_store_ss( &pDestination->y, T );
  2660. #else // _XM_VMX128_INTRINSICS_
  2661. #endif // _XM_VMX128_INTRINSICS_
  2662. }
  2663. //------------------------------------------------------------------------------
  2664. XMFINLINE VOID XMStoreFloat2A
  2665. (
  2666. XMFLOAT2A* pDestination,
  2667. FXMVECTOR V
  2668. )
  2669. {
  2670. #if defined(_XM_NO_INTRINSICS_)
  2671. XMASSERT(pDestination);
  2672. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2673. pDestination->x = V.vector4_f32[0];
  2674. pDestination->y = V.vector4_f32[1];
  2675. #elif defined(_XM_SSE_INTRINSICS_)
  2676. XMASSERT(pDestination);
  2677. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2678. _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
  2679. #else // _XM_VMX128_INTRINSICS_
  2680. #endif // _XM_VMX128_INTRINSICS_
  2681. }
  2682. //------------------------------------------------------------------------------
  2683. XMFINLINE VOID XMStoreHalf2
  2684. (
  2685. XMHALF2* pDestination,
  2686. FXMVECTOR V
  2687. )
  2688. {
  2689. #if defined(_XM_NO_INTRINSICS_)
  2690. XMASSERT(pDestination);
  2691. pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
  2692. pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
  2693. #elif defined(_XM_SSE_INTRINSICS_)
  2694. XMASSERT(pDestination);
  2695. pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
  2696. pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
  2697. #else // _XM_VMX128_INTRINSICS_
  2698. #endif // _XM_VMX128_INTRINSICS_
  2699. }
  2700. //------------------------------------------------------------------------------
  2701. XMFINLINE VOID XMStoreShortN2
  2702. (
  2703. XMSHORTN2* pDestination,
  2704. FXMVECTOR V
  2705. )
  2706. {
  2707. #if defined(_XM_NO_INTRINSICS_)
  2708. XMVECTOR N;
  2709. static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  2710. XMASSERT(pDestination);
  2711. N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
  2712. N = XMVectorMultiply(N, Scale.v);
  2713. N = XMVectorRound(N);
  2714. pDestination->x = (SHORT)N.vector4_f32[0];
  2715. pDestination->y = (SHORT)N.vector4_f32[1];
  2716. #elif defined(_XM_SSE_INTRINSICS_)
  2717. XMASSERT(pDestination);
  2718. static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  2719. XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
  2720. vResult = _mm_min_ps(vResult,g_XMOne);
  2721. vResult = _mm_mul_ps(vResult,Scale);
  2722. __m128i vResulti = _mm_cvtps_epi32(vResult);
  2723. vResulti = _mm_packs_epi32(vResulti,vResulti);
  2724. _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  2725. #else // _XM_VMX128_INTRINSICS_
  2726. #endif // _XM_VMX128_INTRINSICS_
  2727. }
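// Illustrative usage sketch (not part of the original source): XMStoreShortN2 compresses a
// normalized [-1,1] pair, e.g. a texture coordinate or tangent, into signed 16-bit storage.
// Assuming the usual pairing with the corresponding load routine:
//
// XMSHORTN2 packed;
// XMVECTOR uv = XMVectorSet( 0.25f, -0.75f, 0.0f, 0.0f );
// XMStoreShortN2( &packed, uv ); // 0.25f -> 8192, -0.75f -> -24575 after rounding
// XMVECTOR roundTrip = XMLoadShortN2( &packed ); // x,y recover approximately (0.25, -0.75)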
  2728. //------------------------------------------------------------------------------
  2729. XMFINLINE VOID XMStoreShort2
  2730. (
  2731. XMSHORT2* pDestination,
  2732. FXMVECTOR V
  2733. )
  2734. {
  2735. #if defined(_XM_NO_INTRINSICS_)
  2736. XMVECTOR N;
  2737. static CONST XMVECTOR Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
  2738. static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  2739. XMASSERT(pDestination);
  2740. N = XMVectorClamp(V, Min, Max);
  2741. N = XMVectorRound(N);
  2742. pDestination->x = (SHORT)N.vector4_f32[0];
  2743. pDestination->y = (SHORT)N.vector4_f32[1];
  2744. #elif defined(_XM_SSE_INTRINSICS_)
  2745. XMASSERT(pDestination);
  2746. static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
  2747. static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  2748. // Bounds check
  2749. XMVECTOR vResult = _mm_max_ps(V,Min);
  2750. vResult = _mm_min_ps(vResult,Max);
  2751. // Convert to int with rounding
  2752. __m128i vInt = _mm_cvtps_epi32(vResult);
  2753. // Pack the ints into shorts
  2754. vInt = _mm_packs_epi32(vInt,vInt);
  2755. _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
  2756. #else // _XM_VMX128_INTRINSICS_
  2757. #endif // _XM_VMX128_INTRINSICS_
  2758. }
  2759. //------------------------------------------------------------------------------
  2760. XMFINLINE VOID XMStoreUShortN2
  2761. (
  2762. XMUSHORTN2* pDestination,
  2763. FXMVECTOR V
  2764. )
  2765. {
  2766. #if defined(_XM_NO_INTRINSICS_)
  2767. XMVECTOR N;
  2768. static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  2769. XMASSERT(pDestination);
  2770. N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
  2771. N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
  2772. N = XMVectorTruncate(N);
2773. pDestination->x = (USHORT)N.vector4_f32[0];
2774. pDestination->y = (USHORT)N.vector4_f32[1];
  2775. #elif defined(_XM_SSE_INTRINSICS_)
  2776. XMASSERT(pDestination);
  2777. static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  2778. // Bounds check
  2779. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2780. vResult = _mm_min_ps(vResult,g_XMOne);
  2781. vResult = _mm_mul_ps(vResult,Scale);
  2782. // Convert to int with rounding
  2783. __m128i vInt = _mm_cvtps_epi32(vResult);
  2784. // Since the SSE pack instruction clamps using signed rules,
  2785. // manually extract the values to store them to memory
  2786. pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
  2787. pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
  2788. #else // _XM_VMX128_INTRINSICS_
  2789. #endif // _XM_VMX128_INTRINSICS_
  2790. }
  2791. //------------------------------------------------------------------------------
  2792. XMFINLINE VOID XMStoreUShort2
  2793. (
  2794. XMUSHORT2* pDestination,
  2795. FXMVECTOR V
  2796. )
  2797. {
  2798. #if defined(_XM_NO_INTRINSICS_)
  2799. XMVECTOR N;
  2800. static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  2801. XMASSERT(pDestination);
  2802. N = XMVectorClamp(V, XMVectorZero(), Max);
  2803. N = XMVectorRound(N);
2804. pDestination->x = (USHORT)N.vector4_f32[0];
2805. pDestination->y = (USHORT)N.vector4_f32[1];
  2806. #elif defined(_XM_SSE_INTRINSICS_)
  2807. XMASSERT(pDestination);
  2808. static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  2809. // Bounds check
  2810. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2811. vResult = _mm_min_ps(vResult,Max);
  2812. // Convert to int with rounding
  2813. __m128i vInt = _mm_cvtps_epi32(vResult);
  2814. // Since the SSE pack instruction clamps using signed rules,
  2815. // manually extract the values to store them to memory
  2816. pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
  2817. pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
  2818. #else // _XM_VMX128_INTRINSICS_
  2819. #endif // _XM_VMX128_INTRINSICS_
  2820. }
  2821. //------------------------------------------------------------------------------
  2822. XMFINLINE VOID XMStoreInt3
  2823. (
  2824. UINT* pDestination,
  2825. FXMVECTOR V
  2826. )
  2827. {
  2828. #if defined(_XM_NO_INTRINSICS_)
  2829. XMASSERT(pDestination);
  2830. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2831. pDestination[0] = V.vector4_u32[0];
  2832. pDestination[1] = V.vector4_u32[1];
  2833. pDestination[2] = V.vector4_u32[2];
  2834. #elif defined(_XM_SSE_INTRINSICS_)
  2835. XMASSERT(pDestination);
  2836. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2837. XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
  2838. XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
  2839. _mm_store_ss( (float*)pDestination, V );
  2840. _mm_store_ss( (float*)&pDestination[1], T1 );
  2841. _mm_store_ss( (float*)&pDestination[2], T2 );
  2842. #else // _XM_VMX128_INTRINSICS_
  2843. #endif // _XM_VMX128_INTRINSICS_
  2844. }
  2845. //------------------------------------------------------------------------------
  2846. XMFINLINE VOID XMStoreInt3A
  2847. (
  2848. UINT* pDestination,
  2849. FXMVECTOR V
  2850. )
  2851. {
  2852. #if defined(_XM_NO_INTRINSICS_)
  2853. XMASSERT(pDestination);
  2854. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2855. pDestination[0] = V.vector4_u32[0];
  2856. pDestination[1] = V.vector4_u32[1];
  2857. pDestination[2] = V.vector4_u32[2];
  2858. #elif defined(_XM_SSE_INTRINSICS_)
  2859. XMASSERT(pDestination);
  2860. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2861. XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
  2862. _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
  2863. _mm_store_ss( (float*)&pDestination[2], T );
  2864. #else // _XM_VMX128_INTRINSICS_
  2865. #endif // _XM_VMX128_INTRINSICS_
  2866. }
  2867. //------------------------------------------------------------------------------
  2868. XMFINLINE VOID XMStoreFloat3
  2869. (
  2870. XMFLOAT3* pDestination,
  2871. FXMVECTOR V
  2872. )
  2873. {
  2874. #if defined(_XM_NO_INTRINSICS_)
  2875. XMASSERT(pDestination);
  2876. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2877. pDestination->x = V.vector4_f32[0];
  2878. pDestination->y = V.vector4_f32[1];
  2879. pDestination->z = V.vector4_f32[2];
  2880. #elif defined(_XM_SSE_INTRINSICS_)
  2881. XMASSERT(pDestination);
  2882. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  2883. XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
  2884. XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
  2885. _mm_store_ss( &pDestination->x, V );
  2886. _mm_store_ss( &pDestination->y, T1 );
  2887. _mm_store_ss( &pDestination->z, T2 );
  2888. #else // _XM_VMX128_INTRINSICS_
  2889. #endif // _XM_VMX128_INTRINSICS_
  2890. }
  2891. //------------------------------------------------------------------------------
  2892. XMFINLINE VOID XMStoreFloat3A
  2893. (
  2894. XMFLOAT3A* pDestination,
  2895. FXMVECTOR V
  2896. )
  2897. {
  2898. #if defined(_XM_NO_INTRINSICS_)
  2899. XMASSERT(pDestination);
  2900. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2901. pDestination->x = V.vector4_f32[0];
  2902. pDestination->y = V.vector4_f32[1];
  2903. pDestination->z = V.vector4_f32[2];
  2904. #elif defined(_XM_SSE_INTRINSICS_)
  2905. XMASSERT(pDestination);
  2906. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  2907. XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
  2908. _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
  2909. _mm_store_ss( &pDestination->z, T );
  2910. #else // _XM_VMX128_INTRINSICS_
  2911. #endif // _XM_VMX128_INTRINSICS_
  2912. }
  2913. //------------------------------------------------------------------------------
  2914. XMFINLINE VOID XMStoreUHenDN3
  2915. (
  2916. XMUHENDN3* pDestination,
  2917. FXMVECTOR V
  2918. )
  2919. {
  2920. #if defined(_XM_NO_INTRINSICS_)
  2921. XMVECTOR N;
  2922. static CONST XMVECTORF32 Scale = {2047.0f, 2047.0f, 1023.0f, 0.0f};
  2923. XMASSERT(pDestination);
  2924. N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
  2925. N = XMVectorMultiply(N, Scale.v);
  2926. pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
  2927. (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
  2928. (((UINT)N.vector4_f32[0] & 0x7FF));
  2929. #elif defined(_XM_SSE_INTRINSICS_)
  2930. XMASSERT(pDestination);
  2931. static const XMVECTORF32 ScaleUHenDN3 = {2047.0f, 2047.0f*2048.0f,1023.0f*(2048.0f*2048.0f)/2.0f,1.0f};
  2932. static const XMVECTORI32 MaskUHenDN3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
  2933. // Clamp to bounds
  2934. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2935. vResult = _mm_min_ps(vResult,g_XMOne);
  2936. // Scale by multiplication
  2937. vResult = _mm_mul_ps(vResult,ScaleUHenDN3);
  2938. // Convert to int
  2939. __m128i vResulti = _mm_cvttps_epi32(vResult);
  2940. // Mask off any fraction
  2941. vResulti = _mm_and_si128(vResulti,MaskUHenDN3);
  2942. // Do a horizontal or of 3 entries
  2943. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
  2944. // i = x|y
  2945. vResulti = _mm_or_si128(vResulti,vResulti2);
  2946. // Move Z to the x position
  2947. vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
  2948. // Add Z to itself to perform a single bit left shift
  2949. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  2950. // i = x|y|z
  2951. vResulti = _mm_or_si128(vResulti,vResulti2);
  2952. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  2953. #else // _XM_VMX128_INTRINSICS_
  2954. #endif // _XM_VMX128_INTRINSICS_
  2955. }
  2956. //------------------------------------------------------------------------------
  2957. XMFINLINE VOID XMStoreUHenD3
  2958. (
  2959. XMUHEND3* pDestination,
  2960. FXMVECTOR V
  2961. )
  2962. {
  2963. #if defined(_XM_NO_INTRINSICS_)
  2964. XMVECTOR N;
  2965. static CONST XMVECTOR Max = {2047.0f, 2047.0f, 1023.0f, 0.0f};
  2966. XMASSERT(pDestination);
  2967. N = XMVectorClamp(V, XMVectorZero(), Max);
  2968. pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
  2969. (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
  2970. (((UINT)N.vector4_f32[0] & 0x7FF));
  2971. #elif defined(_XM_SSE_INTRINSICS_)
  2972. XMASSERT(pDestination);
  2973. static const XMVECTORF32 MaxUHenD3 = { 2047.0f, 2047.0f, 1023.0f, 1.0f};
  2974. static const XMVECTORF32 ScaleUHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f)/2.0f,1.0f};
  2975. static const XMVECTORI32 MaskUHenD3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
  2976. // Clamp to bounds
  2977. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  2978. vResult = _mm_min_ps(vResult,MaxUHenD3);
  2979. // Scale by multiplication
  2980. vResult = _mm_mul_ps(vResult,ScaleUHenD3);
  2981. // Convert to int
  2982. __m128i vResulti = _mm_cvttps_epi32(vResult);
  2983. // Mask off any fraction
  2984. vResulti = _mm_and_si128(vResulti,MaskUHenD3);
  2985. // Do a horizontal or of 3 entries
  2986. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
  2987. // i = x|y
  2988. vResulti = _mm_or_si128(vResulti,vResulti2);
  2989. // Move Z to the x position
  2990. vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
  2991. // Add Z to itself to perform a single bit left shift
  2992. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  2993. // i = x|y|z
  2994. vResulti = _mm_or_si128(vResulti,vResulti2);
  2995. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  2996. #else // _XM_VMX128_INTRINSICS_
  2997. #endif // _XM_VMX128_INTRINSICS_
  2998. }
  2999. //------------------------------------------------------------------------------
  3000. XMFINLINE VOID XMStoreHenDN3
  3001. (
  3002. XMHENDN3* pDestination,
  3003. FXMVECTOR V
  3004. )
  3005. {
  3006. #if defined(_XM_NO_INTRINSICS_)
  3007. XMVECTOR N;
  3008. static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 511.0f, 1.0f};
  3009. XMASSERT(pDestination);
  3010. N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
  3011. N = XMVectorMultiply(N, Scale.v);
  3012. pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
  3013. (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
  3014. (((INT)N.vector4_f32[0] & 0x7FF));
  3015. #elif defined(_XM_SSE_INTRINSICS_)
  3016. XMASSERT(pDestination);
  3017. static const XMVECTORF32 ScaleHenDN3 = {1023.0f, 1023.0f*2048.0f,511.0f*(2048.0f*2048.0f),1.0f};
  3018. // Clamp to bounds
  3019. XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
  3020. vResult = _mm_min_ps(vResult,g_XMOne);
  3021. // Scale by multiplication
  3022. vResult = _mm_mul_ps(vResult,ScaleHenDN3);
  3023. // Convert to int
  3024. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3025. // Mask off any fraction
  3026. vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
  3027. // Do a horizontal or of all 4 entries
  3028. vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
  3029. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3030. vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
  3031. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3032. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  3033. #else // _XM_VMX128_INTRINSICS_
  3034. #endif // _XM_VMX128_INTRINSICS_
  3035. }
  3036. //------------------------------------------------------------------------------
  3037. XMFINLINE VOID XMStoreHenD3
  3038. (
  3039. XMHEND3* pDestination,
  3040. FXMVECTOR V
  3041. )
  3042. {
  3043. #if defined(_XM_NO_INTRINSICS_)
  3044. XMVECTOR N;
  3045. static CONST XMVECTOR Min = {-1023.0f, -1023.0f, -511.0f, -1.0f};
  3046. static CONST XMVECTOR Max = {1023.0f, 1023.0f, 511.0f, 1.0f};
  3047. XMASSERT(pDestination);
  3048. N = XMVectorClamp(V, Min, Max);
  3049. pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
  3050. (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
  3051. (((INT)N.vector4_f32[0] & 0x7FF));
  3052. #elif defined(_XM_SSE_INTRINSICS_)
  3053. XMASSERT(pDestination);
  3054. static const XMVECTORF32 MinHenD3 = {-1023.0f,-1023.0f,-511.0f,-1.0f};
  3055. static const XMVECTORF32 MaxHenD3 = { 1023.0f, 1023.0f, 511.0f, 1.0f};
  3056. static const XMVECTORF32 ScaleHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f),1.0f};
  3057. // Clamp to bounds
  3058. XMVECTOR vResult = _mm_max_ps(V,MinHenD3);
  3059. vResult = _mm_min_ps(vResult,MaxHenD3);
  3060. // Scale by multiplication
  3061. vResult = _mm_mul_ps(vResult,ScaleHenD3);
  3062. // Convert to int
  3063. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3064. // Mask off any fraction
  3065. vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
  3066. // Do a horizontal or of all 4 entries
  3067. vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
  3068. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3069. vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
  3070. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3071. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  3072. #else // _XM_VMX128_INTRINSICS_
  3073. #endif // _XM_VMX128_INTRINSICS_
  3074. }
  3075. //------------------------------------------------------------------------------
  3076. XMFINLINE VOID XMStoreUDHenN3
  3077. (
  3078. XMUDHENN3* pDestination,
  3079. FXMVECTOR V
  3080. )
  3081. {
  3082. #if defined(_XM_NO_INTRINSICS_)
  3083. XMVECTOR N;
  3084. static CONST XMVECTORF32 Scale = {1023.0f, 2047.0f, 2047.0f, 0.0f};
  3085. XMASSERT(pDestination);
  3086. N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
  3087. N = XMVectorMultiply(N, Scale.v);
  3088. pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
  3089. (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
  3090. (((UINT)N.vector4_f32[0] & 0x3FF));
  3091. #elif defined(_XM_SSE_INTRINSICS_)
  3092. XMASSERT(pDestination);
  3093. static const XMVECTORF32 ScaleUDHenN3 = {1023.0f,2047.0f*1024.0f,2047.0f*(1024.0f*2048.0f)/2.0f,1.0f};
  3094. static const XMVECTORI32 MaskUDHenN3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
  3095. // Clamp to bounds
  3096. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3097. vResult = _mm_min_ps(vResult,g_XMOne);
  3098. // Scale by multiplication
  3099. vResult = _mm_mul_ps(vResult,ScaleUDHenN3);
  3100. // Convert to int
  3101. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3102. // Mask off any fraction
  3103. vResulti = _mm_and_si128(vResulti,MaskUDHenN3);
  3104. // Do a horizontal or of 3 entries
  3105. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
  3106. // i = x|y
  3107. vResulti = _mm_or_si128(vResulti,vResulti2);
  3108. // Move Z to the x position
  3109. vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
  3110. // Add Z to itself to perform a single bit left shift
  3111. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  3112. // i = x|y|z
  3113. vResulti = _mm_or_si128(vResulti,vResulti2);
  3114. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  3115. #else // _XM_VMX128_INTRINSICS_
  3116. #endif // _XM_VMX128_INTRINSICS_
  3117. }
  3118. //------------------------------------------------------------------------------
  3119. XMFINLINE VOID XMStoreUDHen3
  3120. (
  3121. XMUDHEN3* pDestination,
  3122. FXMVECTOR V
  3123. )
  3124. {
  3125. #if defined(_XM_NO_INTRINSICS_)
  3126. XMVECTOR N;
  3127. static CONST XMVECTOR Max = {1023.0f, 2047.0f, 2047.0f, 0.0f};
  3128. XMASSERT(pDestination);
  3129. N = XMVectorClamp(V, XMVectorZero(), Max);
  3130. pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
  3131. (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
  3132. (((UINT)N.vector4_f32[0] & 0x3FF));
  3133. #elif defined(_XM_SSE_INTRINSICS_)
  3134. XMASSERT(pDestination);
  3135. static const XMVECTORF32 MaxUDHen3 = { 1023.0f, 2047.0f, 2047.0f, 1.0f};
  3136. static const XMVECTORF32 ScaleUDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f)/2.0f,1.0f};
  3137. static const XMVECTORI32 MaskUDHen3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
  3138. // Clamp to bounds
  3139. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3140. vResult = _mm_min_ps(vResult,MaxUDHen3);
  3141. // Scale by multiplication
  3142. vResult = _mm_mul_ps(vResult,ScaleUDHen3);
  3143. // Convert to int
  3144. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3145. // Mask off any fraction
  3146. vResulti = _mm_and_si128(vResulti,MaskUDHen3);
  3147. // Do a horizontal or of 3 entries
  3148. __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
  3149. // i = x|y
  3150. vResulti = _mm_or_si128(vResulti,vResulti2);
  3151. // Move Z to the x position
  3152. vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
  3153. // Add Z to itself to perform a single bit left shift
  3154. vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
  3155. // i = x|y|z
  3156. vResulti = _mm_or_si128(vResulti,vResulti2);
  3157. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  3158. #else // _XM_VMX128_INTRINSICS_
  3159. #endif // _XM_VMX128_INTRINSICS_
  3160. }
  3161. //------------------------------------------------------------------------------
  3162. XMFINLINE VOID XMStoreDHenN3
  3163. (
  3164. XMDHENN3* pDestination,
  3165. FXMVECTOR V
  3166. )
  3167. {
  3168. #if defined(_XM_NO_INTRINSICS_)
  3169. XMVECTOR N;
  3170. static CONST XMVECTORF32 Scale = {511.0f, 1023.0f, 1023.0f, 1.0f};
  3171. XMASSERT(pDestination);
  3172. N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
  3173. N = XMVectorMultiply(N, Scale.v);
  3174. pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
  3175. (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
  3176. (((INT)N.vector4_f32[0] & 0x3FF));
  3177. #elif defined(_XM_SSE_INTRINSICS_)
  3178. XMASSERT(pDestination);
  3179. static const XMVECTORF32 ScaleDHenN3 = {511.0f, 1023.0f*1024.0f,1023.0f*(1024.0f*2048.0f),1.0f};
  3180. // Clamp to bounds
  3181. XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
  3182. vResult = _mm_min_ps(vResult,g_XMOne);
  3183. // Scale by multiplication
  3184. vResult = _mm_mul_ps(vResult,ScaleDHenN3);
  3185. // Convert to int
  3186. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3187. // Mask off any fraction
  3188. vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
  3189. // Do a horizontal or of all 4 entries
  3190. vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
  3191. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3192. vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
  3193. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3194. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  3195. #else // _XM_VMX128_INTRINSICS_
  3196. #endif // _XM_VMX128_INTRINSICS_
  3197. }
  3198. //------------------------------------------------------------------------------
  3199. XMFINLINE VOID XMStoreDHen3
  3200. (
  3201. XMDHEN3* pDestination,
  3202. FXMVECTOR V
  3203. )
  3204. {
  3205. #if defined(_XM_NO_INTRINSICS_)
  3206. XMVECTOR N;
  3207. static CONST XMVECTOR Min = {-511.0f, -1023.0f, -1023.0f, -1.0f};
  3208. static CONST XMVECTOR Max = {511.0f, 1023.0f, 1023.0f, 1.0f};
  3209. XMASSERT(pDestination);
  3210. N = XMVectorClamp(V, Min, Max);
  3211. pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
  3212. (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
  3213. (((INT)N.vector4_f32[0] & 0x3FF));
  3214. #elif defined(_XM_SSE_INTRINSICS_)
  3215. XMASSERT(pDestination);
  3216. static const XMVECTORF32 MinDHen3 = {-511.0f,-1023.0f,-1023.0f,-1.0f};
  3217. static const XMVECTORF32 MaxDHen3 = { 511.0f, 1023.0f, 1023.0f, 1.0f};
  3218. static const XMVECTORF32 ScaleDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f),1.0f};
  3219. // Clamp to bounds
  3220. XMVECTOR vResult = _mm_max_ps(V,MinDHen3);
  3221. vResult = _mm_min_ps(vResult,MaxDHen3);
  3222. // Scale by multiplication
  3223. vResult = _mm_mul_ps(vResult,ScaleDHen3);
  3224. // Convert to int
  3225. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3226. // Mask off any fraction
  3227. vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
  3228. // Do a horizontal or of all 4 entries
  3229. vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
  3230. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3231. vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
  3232. vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
  3233. _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
  3234. #else // _XM_VMX128_INTRINSICS_
  3235. #endif // _XM_VMX128_INTRINSICS_
  3236. }
  3237. //------------------------------------------------------------------------------
  3238. XMFINLINE VOID XMStoreU565
  3239. (
  3240. XMU565* pDestination,
  3241. FXMVECTOR V
  3242. )
  3243. {
  3244. #if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
  3245. XMASSERT(pDestination);
  3246. static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
  3247. // Bounds check
  3248. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3249. vResult = _mm_min_ps(vResult,Max);
  3250. // Convert to int with rounding
  3251. __m128i vInt = _mm_cvtps_epi32(vResult);
  3252. // No SSE operations will write to 16-bit values, so we have to extract them manually
  3253. USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
  3254. USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
  3255. USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
  3256. pDestination->v = ((z & 0x1F) << 11) |
  3257. ((y & 0x3F) << 5) |
  3258. ((x & 0x1F));
  3259. #else
  3260. XMVECTOR N;
  3261. static CONST XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f};
  3262. XMASSERT(pDestination);
  3263. N = XMVectorClamp(V, XMVectorZero(), Max.v);
  3264. N = XMVectorRound(N);
  3265. pDestination->v = (((USHORT)N.vector4_f32[2] & 0x1F) << 11) |
  3266. (((USHORT)N.vector4_f32[1] & 0x3F) << 5) |
  3267. (((USHORT)N.vector4_f32[0] & 0x1F));
3268. #endif // !_XM_SSE_INTRINSICS_
  3269. }
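// Illustrative usage sketch (not part of the original source): XMStoreU565 expects unnormalized
// components in the ranges 0..31, 0..63 and 0..31, so a [0,1] color is typically scaled up first:
//
// XMU565 rgb565;
// XMVECTOR color = XMVectorSet( 1.0f, 0.5f, 0.25f, 0.0f );
// XMVECTOR scaled = XMVectorMultiply( color, XMVectorSet( 31.0f, 63.0f, 31.0f, 0.0f ) );
// XMStoreU565( &rgb565, scaled );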
  3270. //------------------------------------------------------------------------------
  3271. XMFINLINE VOID XMStoreFloat3PK
  3272. (
  3273. XMFLOAT3PK* pDestination,
  3274. FXMVECTOR V
  3275. )
  3276. {
  3277. _DECLSPEC_ALIGN_16_ UINT IValue[4];
  3278. UINT I, Sign, j;
  3279. UINT Result[3];
  3280. XMASSERT(pDestination);
  3281. XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
  3282. // X & Y Channels (5-bit exponent, 6-bit mantissa)
  3283. for(j=0; j < 2; ++j)
  3284. {
  3285. Sign = IValue[j] & 0x80000000;
  3286. I = IValue[j] & 0x7FFFFFFF;
  3287. if ((I & 0x7F800000) == 0x7F800000)
  3288. {
  3289. // INF or NAN
  3290. Result[j] = 0x7c0;
  3291. if (( I & 0x7FFFFF ) != 0)
  3292. {
3293. Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
  3294. }
  3295. else if ( Sign )
  3296. {
  3297. // -INF is clamped to 0 since 3PK is positive only
  3298. Result[j] = 0;
  3299. }
  3300. }
  3301. else if ( Sign )
  3302. {
  3303. // 3PK is positive only, so clamp to zero
  3304. Result[j] = 0;
  3305. }
  3306. else if (I > 0x477E0000U)
  3307. {
  3308. // The number is too large to be represented as a float11, set to max
  3309. Result[j] = 0x7BF;
  3310. }
  3311. else
  3312. {
  3313. if (I < 0x38800000U)
  3314. {
  3315. // The number is too small to be represented as a normalized float11
  3316. // Convert it to a denormalized value.
  3317. UINT Shift = 113U - (I >> 23U);
  3318. I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
  3319. }
  3320. else
  3321. {
  3322. // Rebias the exponent to represent the value as a normalized float11
  3323. I += 0xC8000000U;
  3324. }
  3325. Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
  3326. }
  3327. }
  3328. // Z Channel (5-bit exponent, 5-bit mantissa)
  3329. Sign = IValue[2] & 0x80000000;
  3330. I = IValue[2] & 0x7FFFFFFF;
  3331. if ((I & 0x7F800000) == 0x7F800000)
  3332. {
  3333. // INF or NAN
  3334. Result[2] = 0x3e0;
  3335. if ( I & 0x7FFFFF )
  3336. {
3337. Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
  3338. }
  3339. else if ( Sign )
  3340. {
  3341. // -INF is clamped to 0 since 3PK is positive only
  3342. Result[2] = 0;
  3343. }
  3344. }
  3345. else if ( Sign )
  3346. {
  3347. // 3PK is positive only, so clamp to zero
  3348. Result[2] = 0;
  3349. }
  3350. else if (I > 0x477C0000U)
  3351. {
  3352. // The number is too large to be represented as a float10, set to max
  3353. Result[2] = 0x3df;
  3354. }
  3355. else
  3356. {
  3357. if (I < 0x38800000U)
  3358. {
  3359. // The number is too small to be represented as a normalized float10
  3360. // Convert it to a denormalized value.
  3361. UINT Shift = 113U - (I >> 23U);
  3362. I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
  3363. }
  3364. else
  3365. {
  3366. // Rebias the exponent to represent the value as a normalized float10
  3367. I += 0xC8000000U;
  3368. }
  3369. Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
  3370. }
  3371. // Pack Result into memory
  3372. pDestination->v = (Result[0] & 0x7ff)
  3373. | ( (Result[1] & 0x7ff) << 11 )
  3374. | ( (Result[2] & 0x3ff) << 22 );
  3375. }
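// Illustrative usage sketch (not part of the original source): XMFLOAT3PK is the positive-only
// packed layout with 11-bit floats in x/y and a 10-bit float in z, which makes XMStoreFloat3PK
// a natural fit for HDR color data; negative components clamp to zero and w is ignored.
//
// XMFLOAT3PK packed;
// XMVECTOR hdrColor = XMVectorSet( 1.5f, 0.25f, 8.0f, 0.0f );
// XMStoreFloat3PK( &packed, hdrColor );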
  3376. //------------------------------------------------------------------------------
  3377. XMFINLINE VOID XMStoreFloat3SE
  3378. (
  3379. XMFLOAT3SE* pDestination,
  3380. FXMVECTOR V
  3381. )
  3382. {
  3383. _DECLSPEC_ALIGN_16_ UINT IValue[4];
  3384. UINT I, Sign, j, T;
  3385. UINT Frac[3];
  3386. UINT Exp[3];
  3387. XMASSERT(pDestination);
  3388. XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
  3389. // X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
  3390. for(j=0; j < 3; ++j)
  3391. {
  3392. Sign = IValue[j] & 0x80000000;
  3393. I = IValue[j] & 0x7FFFFFFF;
  3394. if ((I & 0x7F800000) == 0x7F800000)
  3395. {
  3396. // INF or NAN
  3397. Exp[j] = 0x1f;
  3398. if (( I & 0x7FFFFF ) != 0)
  3399. {
3400. Frac[j] = ((I>>14)|(I>>5)|(I))&0x1ff;
  3401. }
  3402. else if ( Sign )
  3403. {
  3404. // -INF is clamped to 0 since 3SE is positive only
  3405. Exp[j] = Frac[j] = 0;
  3406. }
  3407. }
  3408. else if ( Sign )
  3409. {
  3410. // 3SE is positive only, so clamp to zero
  3411. Exp[j] = Frac[j] = 0;
  3412. }
  3413. else if (I > 0x477FC000U)
  3414. {
  3415. // The number is too large, set to max
  3416. Exp[j] = 0x1e;
  3417. Frac[j] = 0x1ff;
  3418. }
  3419. else
  3420. {
  3421. if (I < 0x38800000U)
  3422. {
3423. // The number is too small to be represented as a normalized float with a 9-bit mantissa
  3424. // Convert it to a denormalized value.
  3425. UINT Shift = 113U - (I >> 23U);
  3426. I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
  3427. }
  3428. else
  3429. {
3430. // Rebias the exponent to represent the value as a normalized float with a 9-bit mantissa
  3431. I += 0xC8000000U;
  3432. }
  3433. T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU;
  3434. Exp[j] = (T & 0x3E00) >> 9;
  3435. Frac[j] = T & 0x1ff;
  3436. }
  3437. }
  3438. // Adjust to a shared exponent
  3439. T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) );
  3440. Frac[0] = Frac[0] >> (T - Exp[0]);
  3441. Frac[1] = Frac[1] >> (T - Exp[1]);
  3442. Frac[2] = Frac[2] >> (T - Exp[2]);
  3443. // Store packed into memory
  3444. pDestination->xm = Frac[0];
  3445. pDestination->ym = Frac[1];
  3446. pDestination->zm = Frac[2];
  3447. pDestination->e = T;
  3448. }
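// Illustrative usage sketch (not part of the original source): XMFLOAT3SE keeps one 9-bit
// mantissa per channel under a single shared 5-bit exponent taken from the largest component,
// so it suits positive HDR radiance values where relative precision matters more than per-channel range.
//
// XMFLOAT3SE shared;
// XMVECTOR radiance = XMVectorSet( 0.9f, 1.1f, 120.0f, 0.0f );
// XMStoreFloat3SE( &shared, radiance ); // the largest component determines the shared exponent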
  3449. //------------------------------------------------------------------------------
  3450. XMFINLINE VOID XMStoreInt4
  3451. (
  3452. UINT* pDestination,
  3453. FXMVECTOR V
  3454. )
  3455. {
  3456. #if defined(_XM_NO_INTRINSICS_)
  3457. XMASSERT(pDestination);
  3458. pDestination[0] = V.vector4_u32[0];
  3459. pDestination[1] = V.vector4_u32[1];
  3460. pDestination[2] = V.vector4_u32[2];
  3461. pDestination[3] = V.vector4_u32[3];
  3462. #elif defined(_XM_SSE_INTRINSICS_)
  3463. XMASSERT(pDestination);
  3464. _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
  3465. #else // _XM_VMX128_INTRINSICS_
  3466. #endif // _XM_VMX128_INTRINSICS_
  3467. }
  3468. //------------------------------------------------------------------------------
  3469. XMFINLINE VOID XMStoreInt4A
  3470. (
  3471. UINT* pDestination,
  3472. FXMVECTOR V
  3473. )
  3474. {
  3475. #if defined(_XM_NO_INTRINSICS_)
  3476. XMASSERT(pDestination);
  3477. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  3478. pDestination[0] = V.vector4_u32[0];
  3479. pDestination[1] = V.vector4_u32[1];
  3480. pDestination[2] = V.vector4_u32[2];
  3481. pDestination[3] = V.vector4_u32[3];
  3482. #elif defined(_XM_SSE_INTRINSICS_)
  3483. XMASSERT(pDestination);
  3484. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  3485. _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
  3486. #else // _XM_VMX128_INTRINSICS_
  3487. #endif // _XM_VMX128_INTRINSICS_
  3488. }
  3489. //------------------------------------------------------------------------------
  3490. XMFINLINE VOID XMStoreInt4NC
  3491. (
  3492. UINT* pDestination,
  3493. FXMVECTOR V
  3494. )
  3495. {
  3496. #if defined(_XM_NO_INTRINSICS_)
  3497. XMASSERT(pDestination);
  3498. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  3499. pDestination[0] = V.vector4_u32[0];
  3500. pDestination[1] = V.vector4_u32[1];
  3501. pDestination[2] = V.vector4_u32[2];
  3502. pDestination[3] = V.vector4_u32[3];
  3503. #elif defined(_XM_SSE_INTRINSICS_)
  3504. XMASSERT(pDestination);
  3505. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  3506. _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
  3507. #else // _XM_VMX128_INTRINSICS_
  3508. #endif // _XM_VMX128_INTRINSICS_
  3509. }
  3510. //------------------------------------------------------------------------------
  3511. XMFINLINE VOID XMStoreFloat4
  3512. (
  3513. XMFLOAT4* pDestination,
  3514. FXMVECTOR V
  3515. )
  3516. {
  3517. #if defined(_XM_NO_INTRINSICS_)
  3518. XMASSERT(pDestination);
  3519. pDestination->x = V.vector4_f32[0];
  3520. pDestination->y = V.vector4_f32[1];
  3521. pDestination->z = V.vector4_f32[2];
  3522. pDestination->w = V.vector4_f32[3];
  3523. #elif defined(_XM_SSE_INTRINSICS_)
  3524. XMASSERT(pDestination);
  3525. _mm_storeu_ps( &pDestination->x, V );
  3526. #else // _XM_VMX128_INTRINSICS_
  3527. #endif // _XM_VMX128_INTRINSICS_
  3528. }
  3529. //------------------------------------------------------------------------------
  3530. XMFINLINE VOID XMStoreFloat4A
  3531. (
  3532. XMFLOAT4A* pDestination,
  3533. FXMVECTOR V
  3534. )
  3535. {
  3536. #if defined(_XM_NO_INTRINSICS_)
  3537. XMASSERT(pDestination);
  3538. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  3539. pDestination->x = V.vector4_f32[0];
  3540. pDestination->y = V.vector4_f32[1];
  3541. pDestination->z = V.vector4_f32[2];
  3542. pDestination->w = V.vector4_f32[3];
  3543. #elif defined(_XM_SSE_INTRINSICS_)
  3544. XMASSERT(pDestination);
  3545. XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
  3546. _mm_store_ps( &pDestination->x, V );
  3547. #else // _XM_VMX128_INTRINSICS_
  3548. #endif // _XM_VMX128_INTRINSICS_
  3549. }
  3550. //------------------------------------------------------------------------------
  3551. XMFINLINE VOID XMStoreFloat4NC
  3552. (
  3553. XMFLOAT4* pDestination,
  3554. FXMVECTOR V
  3555. )
  3556. {
  3557. #if defined(_XM_NO_INTRINSICS_)
  3558. XMASSERT(pDestination);
  3559. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  3560. pDestination->x = V.vector4_f32[0];
  3561. pDestination->y = V.vector4_f32[1];
  3562. pDestination->z = V.vector4_f32[2];
  3563. pDestination->w = V.vector4_f32[3];
  3564. #elif defined(_XM_SSE_INTRINSICS_)
  3565. XMASSERT(pDestination);
  3566. XMASSERT(((UINT_PTR)pDestination & 3) == 0);
  3567. _mm_storeu_ps( &pDestination->x, V );
  3568. #else // _XM_VMX128_INTRINSICS_
  3569. #endif // _XM_VMX128_INTRINSICS_
  3570. }
  3571. //------------------------------------------------------------------------------
  3572. XMFINLINE VOID XMStoreHalf4
  3573. (
  3574. XMHALF4* pDestination,
  3575. FXMVECTOR V
  3576. )
  3577. {
  3578. #if defined(_XM_NO_INTRINSICS_)
  3579. XMASSERT(pDestination);
  3580. pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
  3581. pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
  3582. pDestination->z = XMConvertFloatToHalf(V.vector4_f32[2]);
  3583. pDestination->w = XMConvertFloatToHalf(V.vector4_f32[3]);
  3584. #elif defined(_XM_SSE_INTRINSICS_)
  3585. XMASSERT(pDestination);
  3586. pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
  3587. pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
  3588. pDestination->z = XMConvertFloatToHalf(XMVectorGetZ(V));
  3589. pDestination->w = XMConvertFloatToHalf(XMVectorGetW(V));
  3590. #else // _XM_VMX128_INTRINSICS_
  3591. #endif // _XM_VMX128_INTRINSICS_
  3592. }
  3593. //------------------------------------------------------------------------------
  3594. XMFINLINE VOID XMStoreShortN4
  3595. (
  3596. XMSHORTN4* pDestination,
  3597. FXMVECTOR V
  3598. )
  3599. {
  3600. #if defined(_XM_NO_INTRINSICS_)
  3601. XMVECTOR N;
  3602. static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  3603. XMASSERT(pDestination);
  3604. N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
  3605. N = XMVectorMultiply(N, Scale.v);
  3606. N = XMVectorRound(N);
  3607. pDestination->x = (SHORT)N.vector4_f32[0];
  3608. pDestination->y = (SHORT)N.vector4_f32[1];
  3609. pDestination->z = (SHORT)N.vector4_f32[2];
  3610. pDestination->w = (SHORT)N.vector4_f32[3];
  3611. #elif defined(_XM_SSE_INTRINSICS_)
  3612. XMASSERT(pDestination);
  3613. static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  3614. XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
  3615. vResult = _mm_min_ps(vResult,g_XMOne);
  3616. vResult = _mm_mul_ps(vResult,Scale);
  3617. __m128i vResulti = _mm_cvtps_epi32(vResult);
  3618. vResulti = _mm_packs_epi32(vResulti,vResulti);
  3619. _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
  3620. #else // _XM_VMX128_INTRINSICS_
  3621. #endif // _XM_VMX128_INTRINSICS_
  3622. }
  3623. //------------------------------------------------------------------------------
  3624. XMFINLINE VOID XMStoreShort4
  3625. (
  3626. XMSHORT4* pDestination,
  3627. FXMVECTOR V
  3628. )
  3629. {
  3630. #if defined(_XM_NO_INTRINSICS_)
  3631. XMVECTOR N;
  3632. static CONST XMVECTOR Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
  3633. static CONST XMVECTOR Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  3634. XMASSERT(pDestination);
  3635. N = XMVectorClamp(V, Min, Max);
  3636. N = XMVectorRound(N);
  3637. pDestination->x = (SHORT)N.vector4_f32[0];
  3638. pDestination->y = (SHORT)N.vector4_f32[1];
  3639. pDestination->z = (SHORT)N.vector4_f32[2];
  3640. pDestination->w = (SHORT)N.vector4_f32[3];
  3641. #elif defined(_XM_SSE_INTRINSICS_)
  3642. XMASSERT(pDestination);
  3643. static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
  3644. static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
  3645. // Bounds check
  3646. XMVECTOR vResult = _mm_max_ps(V,Min);
  3647. vResult = _mm_min_ps(vResult,Max);
  3648. // Convert to int with rounding
  3649. __m128i vInt = _mm_cvtps_epi32(vResult);
  3650. // Pack the ints into shorts
  3651. vInt = _mm_packs_epi32(vInt,vInt);
  3652. _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
  3653. #else // _XM_VMX128_INTRINSICS_
  3654. #endif // _XM_VMX128_INTRINSICS_
  3655. }
  3656. //------------------------------------------------------------------------------
  3657. XMFINLINE VOID XMStoreUShortN4
  3658. (
  3659. XMUSHORTN4* pDestination,
  3660. FXMVECTOR V
  3661. )
  3662. {
  3663. #if defined(_XM_NO_INTRINSICS_)
  3664. XMVECTOR N;
  3665. static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  3666. XMASSERT(pDestination);
  3667. N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
  3668. N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
  3669. N = XMVectorTruncate(N);
3670. pDestination->x = (USHORT)N.vector4_f32[0];
3671. pDestination->y = (USHORT)N.vector4_f32[1];
3672. pDestination->z = (USHORT)N.vector4_f32[2];
3673. pDestination->w = (USHORT)N.vector4_f32[3];
  3674. #elif defined(_XM_SSE_INTRINSICS_)
  3675. XMASSERT(pDestination);
  3676. static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  3677. // Bounds check
  3678. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3679. vResult = _mm_min_ps(vResult,g_XMOne);
  3680. vResult = _mm_mul_ps(vResult,Scale);
  3681. // Convert to int with rounding
  3682. __m128i vInt = _mm_cvtps_epi32(vResult);
  3683. // Since the SSE pack instruction clamps using signed rules,
  3684. // manually extract the values to store them to memory
  3685. pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
  3686. pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
  3687. pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
  3688. pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
  3689. #else // _XM_VMX128_INTRINSICS_
  3690. #endif // _XM_VMX128_INTRINSICS_
  3691. }
  3692. //------------------------------------------------------------------------------
  3693. XMFINLINE VOID XMStoreUShort4
  3694. (
  3695. XMUSHORT4* pDestination,
  3696. FXMVECTOR V
  3697. )
  3698. {
  3699. #if defined(_XM_NO_INTRINSICS_)
  3700. XMVECTOR N;
  3701. static CONST XMVECTOR Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  3702. XMASSERT(pDestination);
  3703. N = XMVectorClamp(V, XMVectorZero(), Max);
  3704. N = XMVectorRound(N);
3705. pDestination->x = (USHORT)N.vector4_f32[0];
3706. pDestination->y = (USHORT)N.vector4_f32[1];
3707. pDestination->z = (USHORT)N.vector4_f32[2];
3708. pDestination->w = (USHORT)N.vector4_f32[3];
  3709. #elif defined(_XM_SSE_INTRINSICS_)
  3710. XMASSERT(pDestination);
  3711. static CONST XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
  3712. // Bounds check
  3713. XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
  3714. vResult = _mm_min_ps(vResult,Max);
  3715. // Convert to int with rounding
  3716. __m128i vInt = _mm_cvtps_epi32(vResult);
  3717. // Since the SSE pack instruction clamps using signed rules,
  3718. // manually extract the values to store them to memory
  3719. pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
  3720. pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
  3721. pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
  3722. pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
  3723. #else // _XM_VMX128_INTRINSICS_
  3724. #endif // _XM_VMX128_INTRINSICS_
  3725. }
  3726. //------------------------------------------------------------------------------
  3727. XMFINLINE VOID XMStoreXIcoN4
  3728. (
  3729. XMXICON4* pDestination,
  3730. FXMVECTOR V
  3731. )
  3732. {
  3733. #if defined(_XM_NO_INTRINSICS_)
  3734. XMVECTOR N;
  3735. static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
  3736. static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 15.0f};
  3737. XMASSERT(pDestination);
  3738. N = XMVectorClamp(V, Min.v, g_XMOne.v);
  3739. N = XMVectorMultiply(N, Scale.v);
  3740. N = XMVectorRound(N);
  3741. pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
  3742. (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
  3743. (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
  3744. (((INT64)N.vector4_f32[0] & 0xFFFFF));
  3745. #elif defined(_XM_SSE_INTRINSICS_)
  3746. XMASSERT(pDestination);
  3747. // Note: Masks are x,w,y and z
  3748. static const XMVECTORF32 MinXIcoN4 = {-1.0f, 0.0f,-1.0f,-1.0f};
  3749. static const XMVECTORF32 ScaleXIcoN4 = {524287.0f,15.0f*4096.0f*65536.0f*0.5f,524287.0f*4096.0f,524287.0f};
  3750. static const XMVECTORI32 MaskXIcoN4 = {0xFFFFF,0xF<<((60-32)-1),0xFFFFF000,0xFFFFF};
  3751. // Clamp to bounds
  3752. XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
  3753. vResult = _mm_max_ps(vResult,MinXIcoN4);
  3754. vResult = _mm_min_ps(vResult,g_XMOne);
  3755. // Scale by multiplication
  3756. vResult = _mm_mul_ps(vResult,ScaleXIcoN4);
  3757. // Convert to integer (w is unsigned)
  3758. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3759. // Mask off unused bits
  3760. vResulti = _mm_and_si128(vResulti,MaskXIcoN4);
  3761. // Isolate Y
  3762. __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
  3763. // Double Y (Really W) to fixup for unsigned conversion
  3764. vResulti = _mm_add_epi32(vResulti,vResulti2);
  3765. // Shift y and z to straddle the 32-bit boundary
  3766. vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
  3767. // Shift it into place
  3768. vResulti2 = _mm_slli_si128(vResulti2,20/8);
  3769. // i = x|y<<20|z<<40|w<<60
  3770. vResulti = _mm_or_si128(vResulti,vResulti2);
  3771. _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
  3772. #else // _XM_VMX128_INTRINSICS_
  3773. #endif // _XM_VMX128_INTRINSICS_
  3774. }
  3775. //------------------------------------------------------------------------------
  3776. XMFINLINE VOID XMStoreXIco4
  3777. (
  3778. XMXICO4* pDestination,
  3779. FXMVECTOR V
  3780. )
  3781. {
  3782. #if defined(_XM_NO_INTRINSICS_)
  3783. XMVECTOR N;
  3784. static CONST XMVECTORF32 Min = {-524287.0f, -524287.0f, -524287.0f, 0.0f};
  3785. static CONST XMVECTORF32 Max = {524287.0f, 524287.0f, 524287.0f, 15.0f};
  3786. XMASSERT(pDestination);
  3787. N = XMVectorClamp(V, Min.v, Max.v);
  3788. pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
  3789. (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
  3790. (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
  3791. (((INT64)N.vector4_f32[0] & 0xFFFFF));
  3792. #elif defined(_XM_SSE_INTRINSICS_)
  3793. XMASSERT(pDestination);
  3794. // Note: Masks are x,w,y and z
  3795. static const XMVECTORF32 MinXIco4 = {-524287.0f, 0.0f,-524287.0f,-524287.0f};
  3796. static const XMVECTORF32 MaxXIco4 = { 524287.0f,15.0f, 524287.0f, 524287.0f};
  3797. static const XMVECTORF32 ScaleXIco4 = {1.0f,4096.0f*65536.0f*0.5f,4096.0f,1.0f};
  3798. static const XMVECTORI32 MaskXIco4 = {0xFFFFF,0xF<<((60-1)-32),0xFFFFF000,0xFFFFF};
  3799. // Clamp to bounds
  3800. XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
  3801. vResult = _mm_max_ps(vResult,MinXIco4);
  3802. vResult = _mm_min_ps(vResult,MaxXIco4);
  3803. // Scale by multiplication
  3804. vResult = _mm_mul_ps(vResult,ScaleXIco4);
  3805. // Convert to int
  3806. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3807. // Mask off any fraction
  3808. vResulti = _mm_and_si128(vResulti,MaskXIco4);
  3809. // Isolate Y
  3810. __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
  3811. // Double Y (Really W) to fixup for unsigned conversion
  3812. vResulti = _mm_add_epi32(vResulti,vResulti2);
  3813. // Shift y and z to straddle the 32-bit boundary
  3814. vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
  3815. // Shift it into place
  3816. vResulti2 = _mm_slli_si128(vResulti2,20/8);
  3817. // i = x|y<<20|z<<40|w<<60
  3818. vResulti = _mm_or_si128(vResulti,vResulti2);
  3819. _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
  3820. #else // _XM_VMX128_INTRINSICS_
  3821. #endif // _XM_VMX128_INTRINSICS_
  3822. }
  3823. //------------------------------------------------------------------------------
  3824. XMFINLINE VOID XMStoreUIcoN4
  3825. (
  3826. XMUICON4* pDestination,
  3827. FXMVECTOR V
  3828. )
  3829. {
  3830. #define XM_URange ((FLOAT)(1 << 20))
  3831. #define XM_URangeDiv2 ((FLOAT)(1 << 19))
  3832. #define XM_UMaxXYZ ((FLOAT)((1 << 20) - 1))
  3833. #define XM_UMaxW ((FLOAT)((1 << 4) - 1))
  3834. #define XM_ScaleXYZ (-(FLOAT)((1 << 20) - 1) / XM_PACK_FACTOR)
  3835. #define XM_ScaleW (-(FLOAT)((1 << 4) - 1) / XM_PACK_FACTOR)
  3836. #define XM_Scale (-1.0f / XM_PACK_FACTOR)
  3837. #define XM_Offset (3.0f)
  3838. #if defined(_XM_NO_INTRINSICS_)
  3839. XMVECTOR N;
  3840. static CONST XMVECTORF32 Scale = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
  3841. XMASSERT(pDestination);
  3842. N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
  3843. N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
  3844. pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
  3845. (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
  3846. (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
  3847. (((UINT64)N.vector4_f32[0] & 0xFFFFF));
  3848. #elif defined(_XM_SSE_INTRINSICS_)
  3849. XMASSERT(pDestination);
  3850. // Note: Masks are x,w,y and z
  3851. static const XMVECTORF32 ScaleUIcoN4 = {1048575.0f,15.0f*4096.0f*65536.0f,1048575.0f*4096.0f,1048575.0f};
  3852. static const XMVECTORI32 MaskUIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
  3853. static const XMVECTORF32 AddUIcoN4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
  3854. // Clamp to bounds
  3855. XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
  3856. vResult = _mm_max_ps(vResult,g_XMZero);
  3857. vResult = _mm_min_ps(vResult,g_XMOne);
  3858. // Scale by multiplication
  3859. vResult = _mm_mul_ps(vResult,ScaleUIcoN4);
  3860. // Adjust for unsigned entries
  3861. vResult = _mm_add_ps(vResult,AddUIcoN4);
  3862. // Convert to int
  3863. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3864. // Fix the signs on the unsigned entries
  3865. vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
  3866. // Mask off any fraction
  3867. vResulti = _mm_and_si128(vResulti,MaskUIcoN4);
  3868. // Shift y and z to straddle the 32-bit boundary
  3869. __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
  3870. // Shift it into place
  3871. vResulti2 = _mm_slli_si128(vResulti2,20/8);
  3872. // i = x|y<<20|z<<40|w<<60
  3873. vResulti = _mm_or_si128(vResulti,vResulti2);
  3874. _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
  3875. #else // _XM_VMX128_INTRINSICS_
  3876. #endif // _XM_VMX128_INTRINSICS_
  3877. #undef XM_URange
  3878. #undef XM_URangeDiv2
  3879. #undef XM_UMaxXYZ
  3880. #undef XM_UMaxW
  3881. #undef XM_ScaleXYZ
  3882. #undef XM_ScaleW
  3883. #undef XM_Scale
  3884. #undef XM_Offset
  3885. }
  3886. //------------------------------------------------------------------------------
  3887. XMFINLINE VOID XMStoreUIco4
  3888. (
  3889. XMUICO4* pDestination,
  3890. FXMVECTOR V
  3891. )
  3892. {
  3893. #define XM_Scale (-1.0f / XM_PACK_FACTOR)
  3894. #define XM_URange ((FLOAT)(1 << 20))
  3895. #define XM_URangeDiv2 ((FLOAT)(1 << 19))
  3896. #if defined(_XM_NO_INTRINSICS_)
  3897. XMVECTOR N;
  3898. static CONST XMVECTOR Max = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
  3899. XMASSERT(pDestination);
  3900. N = XMVectorClamp(V, XMVectorZero(), Max);
  3901. N = XMVectorRound(N);
  3902. pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
  3903. (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
  3904. (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
  3905. (((UINT64)N.vector4_f32[0] & 0xFFFFF));
  3906. #elif defined(_XM_SSE_INTRINSICS_)
  3907. XMASSERT(pDestination);
  3908. // Note: Masks are x,w,y and z
  3909. static const XMVECTORF32 MaxUIco4 = { 1048575.0f, 15.0f, 1048575.0f, 1048575.0f};
  3910. static const XMVECTORF32 ScaleUIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
  3911. static const XMVECTORI32 MaskUIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
  3912. static const XMVECTORF32 AddUIco4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
  3913. // Clamp to bounds
  3914. XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
  3915. vResult = _mm_max_ps(vResult,g_XMZero);
  3916. vResult = _mm_min_ps(vResult,MaxUIco4);
  3917. // Scale by multiplication
  3918. vResult = _mm_mul_ps(vResult,ScaleUIco4);
  3919. vResult = _mm_add_ps(vResult,AddUIco4);
  3920. // Convert to int
  3921. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3922. vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
  3923. // Mask off any fraction
  3924. vResulti = _mm_and_si128(vResulti,MaskUIco4);
  3925. // Shift y and z to straddle the 32-bit boundary
  3926. __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
  3927. // Shift it into place
  3928. vResulti2 = _mm_slli_si128(vResulti2,20/8);
  3929. // i = x|y<<20|z<<40|w<<60
  3930. vResulti = _mm_or_si128(vResulti,vResulti2);
  3931. _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
  3932. #else // _XM_VMX128_INTRINSICS_
  3933. #endif // _XM_VMX128_INTRINSICS_
  3934. #undef XM_Scale
  3935. #undef XM_URange
  3936. #undef XM_URangeDiv2
  3937. }
  3938. //------------------------------------------------------------------------------
  3939. XMFINLINE VOID XMStoreIcoN4
  3940. (
  3941. XMICON4* pDestination,
  3942. FXMVECTOR V
  3943. )
  3944. {
  3945. #define XM_Scale (-1.0f / XM_PACK_FACTOR)
  3946. #define XM_URange ((FLOAT)(1 << 4))
  3947. #define XM_Offset (3.0f)
  3948. #define XM_UMaxXYZ ((FLOAT)((1 << (20 - 1)) - 1))
  3949. #define XM_UMaxW ((FLOAT)((1 << (4 - 1)) - 1))
  3950. #if defined(_XM_NO_INTRINSICS_)
  3951. XMVECTOR N;
  3952. static CONST XMVECTORF32 Scale = {524287.0f, 524287.0f, 524287.0f, 7.0f};
  3953. XMASSERT(pDestination);
  3954. N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
  3955. N = XMVectorMultiplyAdd(N, Scale.v, g_XMNegativeZero.v);
  3956. N = XMVectorRound(N);
  3957. pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
  3958. (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
  3959. (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
  3960. (((UINT64)N.vector4_f32[0] & 0xFFFFF));
  3961. #elif defined(_XM_SSE_INTRINSICS_)
  3962. XMASSERT(pDestination);
  3963. // Note: Masks are x,w,y and z
  3964. static const XMVECTORF32 ScaleIcoN4 = {524287.0f,7.0f*4096.0f*65536.0f,524287.0f*4096.0f,524287.0f};
  3965. static const XMVECTORI32 MaskIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
  3966. // Clamp to bounds
  3967. XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
  3968. vResult = _mm_max_ps(vResult,g_XMNegativeOne);
  3969. vResult = _mm_min_ps(vResult,g_XMOne);
  3970. // Scale by multiplication
  3971. vResult = _mm_mul_ps(vResult,ScaleIcoN4);
  3972. // Convert to int
  3973. __m128i vResulti = _mm_cvttps_epi32(vResult);
  3974. // Mask off any fraction
  3975. vResulti = _mm_and_si128(vResulti,MaskIcoN4);
  3976. // Shift y and z to straddle the 32-bit boundary
  3977. __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
  3978. // Shift it into place
  3979. vResulti2 = _mm_slli_si128(vResulti2,20/8);
  3980. // i = x|y<<20|z<<40|w<<60
  3981. vResulti = _mm_or_si128(vResulti,vResulti2);
  3982. _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
  3983. #else // _XM_VMX128_INTRINSICS_
  3984. #endif // _XM_VMX128_INTRINSICS_
  3985. #undef XM_Scale
  3986. #undef XM_URange
  3987. #undef XM_Offset
  3988. #undef XM_UMaxXYZ
  3989. #undef XM_UMaxW
  3990. }
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreIco4
(
    XMICO4* pDestination,
    FXMVECTOR V
)
{
#define XM_Scale (-1.0f / XM_PACK_FACTOR)
#define XM_URange ((FLOAT)(1 << 4))
#define XM_Offset (3.0f)
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTOR Min = {-524287.0f, -524287.0f, -524287.0f, -7.0f};
    static CONST XMVECTOR Max = {524287.0f, 524287.0f, 524287.0f, 7.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);
    pDestination->v = ((INT64)N.vector4_f32[3] << 60) |
                      (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
                      (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
                      (((INT64)N.vector4_f32[0] & 0xFFFFF));
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // Note: Masks are x,w,y and z
    static const XMVECTORF32 MinIco4 = {-524287.0f,-7.0f,-524287.0f,-524287.0f};
    static const XMVECTORF32 MaxIco4 = { 524287.0f, 7.0f, 524287.0f, 524287.0f};
    static const XMVECTORF32 ScaleIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
    static const XMVECTORI32 MaskIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
    // Clamp to bounds
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
    vResult = _mm_max_ps(vResult,MinIco4);
    vResult = _mm_min_ps(vResult,MaxIco4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleIco4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskIco4);
    // Shift y and z to straddle the 32-bit boundary
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
    // Shift it into place
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
    // i = x|y<<20|z<<40|w<<60
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
#undef XM_Scale
#undef XM_URange
#undef XM_Offset
}
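// Usage sketch (the variable names and component values below are illustrative,
// not part of the original source): XMStoreIco4 clamps x, y and z to
// [-524287, 524287] and w to [-7, 7], rounds, and packs the results into the
// 64-bit XMICO4::v field as two's-complement bit fields (x in the low 20 bits,
// w in the top 4 bits).
//
//     XMICO4 packed;
//     XMStoreIco4(&packed, XMVectorSet(1000.0f, -2000.0f, 300000.0f, -5.0f));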
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreXDecN4
(
    XMXDECN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
    static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, Min.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
    static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f, 3.0f*536870912.0f};
    static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29};
    XMASSERT(pDestination);
    XMVECTOR vResult = _mm_max_ps(V,Min);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,Scale);
    // Convert to int (W is unsigned)
    __m128i vResulti = _mm_cvtps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,ScaleMask);
    // To fix W, add itself to shift it up to <<30 instead of <<29
    __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
    vResulti = _mm_add_epi32(vResulti,vResultw);
    // Do a horizontal or of all 4 entries
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
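// Usage sketch (illustrative values): XMStoreXDecN4 maps x, y and z from
// [-1, 1] onto signed 10-bit fields and w from [0, 1] onto an unsigned 2-bit
// field, which makes it a convenient store for packed vertex normals.
//
//     XMXDECN4 packedNormal;
//     XMStoreXDecN4(&packedNormal, XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f));
//     // y maps to 511 (the largest positive 10-bit value); x, z and w map to 0.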
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreXDec4
(
    XMXDEC4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, 0.0f};
    static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 3.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, Min, Max);
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f};
    static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f};
    static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
    static const XMVECTORI32 MaskXDec4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinXDec4);
    vResult = _mm_min_ps(vResult,MaxXDec4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleXDec4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskXDec4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a single bit left shift on y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreUDecN4
(
    XMUDECN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((UINT)N.vector4_f32[0] & 0x3FF));
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f};
    static const XMVECTORI32 MaskUDecN4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUDecN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUDecN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a left shift by one bit on y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
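// Usage sketch (illustrative values): XMStoreUDecN4 maps x, y and z from
// [0, 1] onto unsigned 10-bit fields and w from [0, 1] onto an unsigned 2-bit
// field, i.e. an unsigned-normalized 10:10:10:2 layout.
//
//     XMUDECN4 packed;
//     XMStoreUDecN4(&packed, XMVectorSet(1.0f, 0.5f, 0.0f, 1.0f));
//     // x -> 1023, y -> 511, z -> 0, w -> 3.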
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreUDec4
(
    XMUDEC4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTOR Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, XMVectorZero(), Max);
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
                      (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((UINT)N.vector4_f32[0] & 0x3FF));
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
    static const XMVECTORI32 MaskUDec4 = {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,MaxUDec4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUDec4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUDec4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a left shift by one bit on y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreDecN4
(
    XMDECN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
    static const XMVECTORI32 MaskDecN4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleDecN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskDecN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
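// Usage sketch (illustrative values): XMStoreDecN4 maps all four components
// from [-1, 1] onto signed fields (10 bits for x, y and z, 2 bits for w).
//
//     XMDECN4 packed;
//     XMStoreDecN4(&packed, XMVectorSet(-1.0f, 0.0f, 1.0f, 1.0f));
//     // x -> -511, y -> 0, z -> 511, w -> 1.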
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreDec4
(
    XMDEC4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTOR Min = {-511.0f, -511.0f, -511.0f, -1.0f};
    static CONST XMVECTOR Max = {511.0f, 511.0f, 511.0f, 1.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, Min, Max);
    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
                      (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
                      (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
                      (((INT)N.vector4_f32[0] & 0x3FF));
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
    static const XMVECTORI32 MaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinDec4);
    vResult = _mm_min_ps(vResult,MaxDec4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleDec4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskDec4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreUByteN4
(
    XMUBYTEN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};
    XMASSERT(pDestination);
    N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);
    pDestination->x = (BYTE)N.vector4_f32[0];
    pDestination->y = (BYTE)N.vector4_f32[1];
    pDestination->z = (BYTE)N.vector4_f32[2];
    pDestination->w = (BYTE)N.vector4_f32[3];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
    static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUByteN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUByteN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a single bit left shift to fix y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
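// Usage sketch (illustrative values): XMStoreUByteN4 saturates each component
// to [0, 1] and scales it to an 8-bit value, so it is a natural fit for
// storing an RGBA color at 8 bits per channel.
//
//     XMUBYTEN4 rgba;
//     XMStoreUByteN4(&rgba, XMVectorSet(1.0f, 0.0f, 0.2f, 1.0f));
//     // rgba.x == 255, rgba.y == 0, rgba.z == 51, rgba.w == 255.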
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreUByte4
(
    XMUBYTE4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTOR Max = {255.0f, 255.0f, 255.0f, 255.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, XMVectorZero(), Max);
    N = XMVectorRound(N);
    pDestination->x = (BYTE)N.vector4_f32[0];
    pDestination->y = (BYTE)N.vector4_f32[1];
    pDestination->z = (BYTE)N.vector4_f32[2];
    pDestination->w = (BYTE)N.vector4_f32[3];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f};
    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,MaxUByte4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleUByte4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskUByte4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // Perform a single bit left shift to fix y|w
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreByteN4
(
    XMBYTEN4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f};
    XMASSERT(pDestination);
    N = XMVectorMultiply(V, Scale.v);
    N = XMVectorRound(N);
    pDestination->x = (CHAR)N.vector4_f32[0];
    pDestination->y = (CHAR)N.vector4_f32[1];
    pDestination->z = (CHAR)N.vector4_f32[2];
    pDestination->w = (CHAR)N.vector4_f32[3];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleByteN4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskByteN4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
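// Usage sketch (illustrative values): XMStoreByteN4 maps each component from
// [-1, 1] onto a signed 8-bit value in [-127, 127]. Note that the SSE path
// clamps the input while the scalar path assumes it is already in range.
//
//     XMBYTEN4 packed;
//     XMStoreByteN4(&packed, XMVectorSet(-1.0f, 0.0f, 0.5f, 1.0f));
//     // packed.x == -127, packed.y == 0, packed.w == 127; packed.z is 63 or 64
//     // depending on whether the active path truncates or rounds 63.5.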
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreByte4
(
    XMBYTE4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTOR Min = {-127.0f, -127.0f, -127.0f, -127.0f};
    static CONST XMVECTOR Max = {127.0f, 127.0f, 127.0f, 127.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);
    pDestination->x = (CHAR)N.vector4_f32[0];
    pDestination->y = (CHAR)N.vector4_f32[1];
    pDestination->z = (CHAR)N.vector4_f32[2];
    pDestination->w = (CHAR)N.vector4_f32[3];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f};
    static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f};
    static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
    static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
    // Clamp to bounds
    XMVECTOR vResult = _mm_max_ps(V,MinByte4);
    vResult = _mm_min_ps(vResult,MaxByte4);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,ScaleByte4);
    // Convert to int
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Mask off any fraction
    vResulti = _mm_and_si128(vResulti,MaskByte4);
    // Do a horizontal or of 4 entries
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
    // x = x|z, y = y|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    // Move Z to the x position
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
    // i = x|y|z|w
    vResulti = _mm_or_si128(vResulti,vResulti2);
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreUNibble4
(
    XMUNIBBLE4* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
    pDestination->v = ((w & 0xF) << 12) |
                      ((z & 0xF) << 8) |
                      ((y & 0xF) << 4) |
                      ((x & 0xF));
#else
    XMVECTOR N;
    static CONST XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
    N = XMVectorRound(N);
    pDestination->v = (((USHORT)N.vector4_f32[3] & 0xF) << 12) |
                      (((USHORT)N.vector4_f32[2] & 0xF) << 8) |
                      (((USHORT)N.vector4_f32[1] & 0xF) << 4) |
                      (((USHORT)N.vector4_f32[0] & 0xF));
#endif // !_XM_SSE_INTRINSICS_
}
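// Usage sketch (illustrative values): XMStoreUNibble4 clamps each component to
// [0, 15] and packs the four results into one 16-bit value, with x in the low
// nibble and w in the high nibble.
//
//     XMUNIBBLE4 packed;
//     XMStoreUNibble4(&packed, XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f));
//     // packed.v == 0x4321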
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreU555(
    XMU555* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // No SSE operations will write to 16-bit values, so we have to extract them manually
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
    pDestination->v = ((w) ? 0x8000 : 0) |
                      ((z & 0x1F) << 10) |
                      ((y & 0x1F) << 5) |
                      ((x & 0x1F));
#else
    XMVECTOR N;
    static CONST XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f};
    XMASSERT(pDestination);
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
    N = XMVectorRound(N);
    pDestination->v = ((N.vector4_f32[3] > 0.f) ? 0x8000 : 0) |
                      (((USHORT)N.vector4_f32[2] & 0x1F) << 10) |
                      (((USHORT)N.vector4_f32[1] & 0x1F) << 5) |
                      (((USHORT)N.vector4_f32[0] & 0x1F));
#endif // !_XM_SSE_INTRINSICS_
}
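// Usage sketch (illustrative values): XMStoreU555 clamps x, y and z to
// [0, 31], stores them as 5-bit fields, and stores w as a single flag bit in
// bit 15 (set whenever the rounded w is non-zero).
//
//     XMU555 packed;
//     XMStoreU555(&packed, XMVectorSet(31.0f, 0.0f, 16.0f, 1.0f));
//     // packed.v == 0xC01F  (w flag | z == 16 | y == 0 | x == 31)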
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreColor
(
    XMCOLOR* pDestination,
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR N;
    static CONST XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f};
    XMASSERT(pDestination);
    N = XMVectorSaturate(V);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);
    pDestination->c = ((UINT)N.vector4_f32[3] << 24) |
                      ((UINT)N.vector4_f32[0] << 16) |
                      ((UINT)N.vector4_f32[1] << 8) |
                      ((UINT)N.vector4_f32[2]);
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    static CONST XMVECTORF32 Scale = {255.0f,255.0f,255.0f,255.0f};
    // Set <0 to 0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Set >1 to 1
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Convert to 0-255
    vResult = _mm_mul_ps(vResult,Scale);
    // Shuffle RGBA to ARGB
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
    // Convert to int
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Mash to shorts
    vInt = _mm_packs_epi32(vInt,vInt);
    // Mash to bytes
    vInt = _mm_packus_epi16(vInt,vInt);
    // Store the color
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
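// Usage sketch (illustrative values): XMStoreColor saturates an RGBA vector,
// scales it to 8 bits per channel, and packs it as a 32-bit ARGB value
// (alpha in the top byte, blue in the low byte) in XMCOLOR::c.
//
//     XMCOLOR c;
//     XMStoreColor(&c, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f));   // opaque red
//     // c.c == 0xFFFF0000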
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat3x3
(
    XMFLOAT3X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
    XMStoreFloat3x3NC(pDestination, M);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat3x3NC
(
    XMFLOAT3X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
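// Usage sketch (illustrative values): XMStoreFloat3x3 writes the upper 3x3
// block of an XMMATRIX into an unaligned XMFLOAT3X3, which is handy for
// storing a pure rotation without the translation row.
//
//     XMFLOAT3X3 rotation;
//     XMStoreFloat3x3(&rotation, XMMatrixRotationY(XM_PIDIV2));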
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat4x3
(
    XMFLOAT4X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
    XMStoreFloat4x3NC(pDestination, M);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat4x3A
(
    XMFLOAT4X3A* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
    // x1,y1,z1,w1
    XMVECTOR vTemp1 = M.r[0];
    // x2,y2,z2,w2
    XMVECTOR vTemp2 = M.r[1];
    // x3,y3,z3,w3
    XMVECTOR vTemp3 = M.r[2];
    // x4,y4,z4,w4
    XMVECTOR vTemp4 = M.r[3];
    // z1,z1,x2,y2
    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
    // y2,z2,x3,y3 (Final)
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
    // z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    // Store in 3 operations
    _mm_store_ps(&pDestination->m[0][0],vTemp1);
    _mm_store_ps(&pDestination->m[1][1],vTemp2);
    _mm_store_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat4x3NC
(
    XMFLOAT4X3* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vTemp4 = M.r[3];
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat4x4
(
    XMFLOAT4X4* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    XMStoreFloat4x4NC(pDestination, M);
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    _mm_storeu_ps( &pDestination->_11, M.r[0] );
    _mm_storeu_ps( &pDestination->_21, M.r[1] );
    _mm_storeu_ps( &pDestination->_31, M.r[2] );
    _mm_storeu_ps( &pDestination->_41, M.r[3] );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
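// Usage sketch (illustrative values): XMStoreFloat4x4 copies a full 4x4 matrix
// to an XMFLOAT4X4 that may live at any alignment; the SSE path uses unaligned
// stores for that reason.
//
//     XMFLOAT4X4 world;
//     XMStoreFloat4x4(&world, XMMatrixTranslation(1.0f, 2.0f, 3.0f));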
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat4x4A
(
    XMFLOAT4X4A* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    _mm_store_ps( &pDestination->_11, M.r[0] );
    _mm_store_ps( &pDestination->_21, M.r[1] );
    _mm_store_ps( &pDestination->_31, M.r[2] );
    _mm_store_ps( &pDestination->_41, M.r[3] );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
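// Usage sketch: XMStoreFloat4x4A is the aligned variant; the destination must
// be 16-byte aligned so that the SSE path can use aligned stores. The variable
// name below is illustrative.
//
//     XMFLOAT4X4A alignedWorld;
//     XMStoreFloat4x4A(&alignedWorld, XMMatrixIdentity());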
//------------------------------------------------------------------------------
XMFINLINE VOID XMStoreFloat4x4NC
(
    XMFLOAT4X4* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pDestination);
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    _mm_storeu_ps(&pDestination->m[0][0],M.r[0]);
    _mm_storeu_ps(&pDestination->m[1][0],M.r[1]);
    _mm_storeu_ps(&pDestination->m[2][0],M.r[2]);
    _mm_storeu_ps(&pDestination->m[3][0],M.r[3]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
#endif // __XNAMATHCONVERT_INL__