Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

467 lines
17 KiB

  1. /**
  2. *** Copyright (C) 1985-1999 Intel Corporation. All rights reserved.
  3. ***
  4. *** The information and source code contained herein is the exclusive
  5. *** property of Intel Corporation and may not be disclosed, examined
  6. *** or reproduced in whole or in part without explicit written authorization
  7. *** from the company.
  8. ***
  9. **/
  10. /*
  11. * xmmintrin.h
  12. *
  13. * Principal header file for Streaming SIMD Extensions intrinsics
  14. *
  15. * The intrinsics package can be used in 2 ways, depending on whether or not
  16. * _MM_FUNCTIONALITY is defined; if it is, the C/x87 implementation
  17. * will be used (the "faux intrinsics"); otherwise the real intrinsics are used.
  18. *
  19. *
  20. * Note that the __m128 datatype provided in _MM_FUNCTIONALITY (or legacy
  21. * _MM2_FUNCTIONALITY) mode is implemented as a struct, will not be 128-bit
  22. * aligned, will be passed via the stack, etc. _MM_FUNCTIONALITY mode is not
  23. * intended for performance, just semantics.
  24. *
  25. */
  26. #ifndef _INCLUDED_MM2
  27. #define _INCLUDED_MM2
  /*
   * The integer Streaming SIMD Extensions intrinsics need the __m64 type,
   * so <mmintrin.h> is pulled in unless _MMINTRIN_H_INCLUDED is already
   * defined (presumably that header's own include guard -- confirm).
   */
  #if !defined(_MMINTRIN_H_INCLUDED)
  #include <mmintrin.h>
  #endif

  /* Support old notation: _MM2_FUNCTIONALITY implies _MM_FUNCTIONALITY. */
  #if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
  #define _MM_FUNCTIONALITY
  #endif

  /*
   * Select the __m128 definition.
   *   - __ICL with _MM_FUNCTIONALITY : definitions come from "xmm_func.h".
   *   - __ICL without it             : real intrinsics; __m128 is a typedef
   *                                    of long long.
   *   - otherwise, _MSC_VER >= 1300  : 16-byte aligned struct of four floats.
   * When __ICL is not defined, <malloc.h> is also included to pick up
   * _mm_malloc() and _mm_free().
   */
  #if defined(__ICL)
  #if defined(_MM_FUNCTIONALITY)
  #include "xmm_func.h"
  #else
  /* using real intrinsics */
  typedef long long __m128;
  #endif
  #else /* !__ICL */
  #if _MSC_VER >= 1300
  typedef struct __declspec(intrin_type) __declspec(align(16)) __m128 {
      float m128_f32[4];
  } __m128;
  #endif
  #if !defined(_INC_MALLOC)
  /* pick up _mm_malloc() and _mm_free() */
  #include <malloc.h>
  #endif
  #endif /* __ICL */
  /*******************************************************/
  /* _MM_SHUFFLE(fp3, fp2, fp1, fp0)                     */
  /*                                                     */
  /* Builds the shuffle parameter for _mm_shuffle_ps().  */
  /* Every argument is a digit [0123].                   */
  /*   fp3, fp2 : which fp of argument "b" is placed in  */
  /*              fp3 / fp2 of the result.               */
  /*   fp1, fp0 : which fp of argument "a" is placed in  */
  /*              fp1 / fp0 of the result.               */
  /* Each selector occupies two bits; fp0 is in the      */
  /* lowest two bits and fp3 in the highest two.         */
  /*******************************************************/
  #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
      ((fp0) | ((fp1) << 2) | ((fp2) << 4) | ((fp3) << 6))
  /*******************************************************/
  /* _MM_TRANSPOSE4_PS(row0, row1, row2, row3)           */
  /*                                                     */
  /* Transposes, in place, a 4x4 matrix of single        */
  /* precision floating point values.                    */
  /*                                                     */
  /* IN/OUT : row0..row3 are __m128 lvalues holding the  */
  /*          rows of the matrix. On completion row0     */
  /*          holds column 0 of the original matrix,     */
  /*          row1 holds column 1, and so on.            */
  /*                                                     */
  /* NOTES  : - Each row argument is expanded more than  */
  /*            once, so it must be free of side effects.*/
  /*          - The scratch variables use reserved-style */
  /*            names (_Tmp0.._Tmp3) so that caller      */
  /*            variables called tmp0..tmp3 can be       */
  /*            passed as rows without being shadowed.   */
  /*          - The expansion is a brace-enclosed        */
  /*            compound statement, not an expression;   */
  /*            that form is kept so existing call sites */
  /*            (with or without a trailing ';') still   */
  /*            compile.                                 */
  /*******************************************************/
  #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) { \
      __m128 _Tmp3, _Tmp2, _Tmp1, _Tmp0; \
      \
      _Tmp0 = _mm_shuffle_ps((row0), (row1), 0x44); \
      _Tmp2 = _mm_shuffle_ps((row0), (row1), 0xEE); \
      _Tmp1 = _mm_shuffle_ps((row2), (row3), 0x44); \
      _Tmp3 = _mm_shuffle_ps((row2), (row3), 0xEE); \
      \
      (row0) = _mm_shuffle_ps(_Tmp0, _Tmp1, 0x88); \
      (row1) = _mm_shuffle_ps(_Tmp0, _Tmp1, 0xDD); \
      (row2) = _mm_shuffle_ps(_Tmp2, _Tmp3, 0x88); \
      (row3) = _mm_shuffle_ps(_Tmp2, _Tmp3, 0xDD); \
  }
  /* Hint constants passed as the second argument of _mm_prefetch(). */
  #define _MM_HINT_NTA 0
  #define _MM_HINT_T0  1
  #define _MM_HINT_T1  2
  #define _MM_HINT_T2  3

  /*
   * Declaration prefix requesting 16-byte alignment.
   * (this declspec not supported with 0.A or 0.B)
   */
  #define _MM_ALIGN16 __declspec(align(16))
  /*
   * MACRO functions for setting and reading the MXCSR.
   *
   * The constants below name bit fields of the value returned by
   * _mm_getcsr() and accepted by _mm_setcsr(). Every _MM_SET_* macro
   * reads the current value, clears its own field, ORs in the argument
   * and writes the result back; every _MM_GET_* macro returns the
   * current value masked down to its field.
   */

  /* Exception state flags (bits 0-5) */
  #define _MM_EXCEPT_INVALID    0x0001
  #define _MM_EXCEPT_DENORM     0x0002
  #define _MM_EXCEPT_DIV_ZERO   0x0004
  #define _MM_EXCEPT_OVERFLOW   0x0008
  #define _MM_EXCEPT_UNDERFLOW  0x0010
  #define _MM_EXCEPT_INEXACT    0x0020
  #define _MM_EXCEPT_MASK       0x003f

  /* Exception mask bits (bits 7-12) */
  #define _MM_MASK_INVALID      0x0080
  #define _MM_MASK_DENORM       0x0100
  #define _MM_MASK_DIV_ZERO     0x0200
  #define _MM_MASK_OVERFLOW     0x0400
  #define _MM_MASK_UNDERFLOW    0x0800
  #define _MM_MASK_INEXACT      0x1000
  #define _MM_MASK_MASK         0x1f80

  /* Rounding mode field (bits 13-14) */
  #define _MM_ROUND_NEAREST     0x0000
  #define _MM_ROUND_DOWN        0x2000
  #define _MM_ROUND_UP          0x4000
  #define _MM_ROUND_TOWARD_ZERO 0x6000
  #define _MM_ROUND_MASK        0x6000

  /* Flush-to-zero field (bit 15) */
  #define _MM_FLUSH_ZERO_OFF    0x0000
  #define _MM_FLUSH_ZERO_ON     0x8000
  #define _MM_FLUSH_ZERO_MASK   0x8000

  /* Exception state accessors */
  #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
  #define _MM_SET_EXCEPTION_STATE(mask) \
      _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (mask))

  /* Exception mask accessors */
  #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
  #define _MM_SET_EXCEPTION_MASK(mask) \
      _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (mask))

  /* Rounding mode accessors */
  #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
  #define _MM_SET_ROUNDING_MODE(mode) \
      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (mode))

  /*
   * Flush-to-zero accessors.
   * NOTE: _MM_GET_FLUSH_ZERO_MODE is declared with one parameter that
   * its expansion never uses; the parameter is retained so existing
   * callers that pass an argument keep compiling.
   */
  #define _MM_GET_FLUSH_ZERO_MODE(mode) (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
  #define _MM_SET_FLUSH_ZERO_MODE(mode) \
      _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (mode))
  139. /*****************************************************/
  140. /* INTRINSICS FUNCTION PROTOTYPES START HERE */
  141. /*****************************************************/
  142. #if defined __cplusplus
  143. extern "C" { /* Begin "C" */
  144. /* Intrinsics use C name-mangling. */
  145. #endif /* __cplusplus */
  146. /*
  147. * FP, arithmetic
  148. */
  149. extern __m128 _mm_add_ss(__m128 a, __m128 b);
  150. extern __m128 _mm_add_ps(__m128 a, __m128 b);
  151. extern __m128 _mm_sub_ss(__m128 a, __m128 b);
  152. extern __m128 _mm_sub_ps(__m128 a, __m128 b);
  153. extern __m128 _mm_mul_ss(__m128 a, __m128 b);
  154. extern __m128 _mm_mul_ps(__m128 a, __m128 b);
  155. extern __m128 _mm_div_ss(__m128 a, __m128 b);
  156. extern __m128 _mm_div_ps(__m128 a, __m128 b);
  157. extern __m128 _mm_sqrt_ss(__m128 a);
  158. extern __m128 _mm_sqrt_ps(__m128 a);
  159. extern __m128 _mm_rcp_ss(__m128 a);
  160. extern __m128 _mm_rcp_ps(__m128 a);
  161. extern __m128 _mm_rsqrt_ss(__m128 a);
  162. extern __m128 _mm_rsqrt_ps(__m128 a);
  163. extern __m128 _mm_min_ss(__m128 a, __m128 b);
  164. extern __m128 _mm_min_ps(__m128 a, __m128 b);
  165. extern __m128 _mm_max_ss(__m128 a, __m128 b);
  166. extern __m128 _mm_max_ps(__m128 a, __m128 b);
  167. /*
  168. * FP, logical
  169. */
  170. extern __m128 _mm_and_ps(__m128 a, __m128 b);
  171. extern __m128 _mm_andnot_ps(__m128 a, __m128 b);
  172. extern __m128 _mm_or_ps(__m128 a, __m128 b);
  173. extern __m128 _mm_xor_ps(__m128 a, __m128 b);
  174. /*
  175. * FP, comparison
  176. */
  177. extern __m128 _mm_cmpeq_ss(__m128 a, __m128 b);
  178. extern __m128 _mm_cmpeq_ps(__m128 a, __m128 b);
  179. extern __m128 _mm_cmplt_ss(__m128 a, __m128 b);
  180. extern __m128 _mm_cmplt_ps(__m128 a, __m128 b);
  181. extern __m128 _mm_cmple_ss(__m128 a, __m128 b);
  182. extern __m128 _mm_cmple_ps(__m128 a, __m128 b);
  183. extern __m128 _mm_cmpgt_ss(__m128 a, __m128 b);
  184. extern __m128 _mm_cmpgt_ps(__m128 a, __m128 b);
  185. extern __m128 _mm_cmpge_ss(__m128 a, __m128 b);
  186. extern __m128 _mm_cmpge_ps(__m128 a, __m128 b);
  187. extern __m128 _mm_cmpneq_ss(__m128 a, __m128 b);
  188. extern __m128 _mm_cmpneq_ps(__m128 a, __m128 b);
  189. extern __m128 _mm_cmpnlt_ss(__m128 a, __m128 b);
  190. extern __m128 _mm_cmpnlt_ps(__m128 a, __m128 b);
  191. extern __m128 _mm_cmpnle_ss(__m128 a, __m128 b);
  192. extern __m128 _mm_cmpnle_ps(__m128 a, __m128 b);
  193. extern __m128 _mm_cmpngt_ss(__m128 a, __m128 b);
  194. extern __m128 _mm_cmpngt_ps(__m128 a, __m128 b);
  195. extern __m128 _mm_cmpnge_ss(__m128 a, __m128 b);
  196. extern __m128 _mm_cmpnge_ps(__m128 a, __m128 b);
  197. extern __m128 _mm_cmpord_ss(__m128 a, __m128 b);
  198. extern __m128 _mm_cmpord_ps(__m128 a, __m128 b);
  199. extern __m128 _mm_cmpunord_ss(__m128 a, __m128 b);
  200. extern __m128 _mm_cmpunord_ps(__m128 a, __m128 b);
  201. extern int _mm_comieq_ss(__m128 a, __m128 b);
  202. extern int _mm_comilt_ss(__m128 a, __m128 b);
  203. extern int _mm_comile_ss(__m128 a, __m128 b);
  204. extern int _mm_comigt_ss(__m128 a, __m128 b);
  205. extern int _mm_comige_ss(__m128 a, __m128 b);
  206. extern int _mm_comineq_ss(__m128 a, __m128 b);
  207. extern int _mm_ucomieq_ss(__m128 a, __m128 b);
  208. extern int _mm_ucomilt_ss(__m128 a, __m128 b);
  209. extern int _mm_ucomile_ss(__m128 a, __m128 b);
  210. extern int _mm_ucomigt_ss(__m128 a, __m128 b);
  211. extern int _mm_ucomige_ss(__m128 a, __m128 b);
  212. extern int _mm_ucomineq_ss(__m128 a, __m128 b);
  213. /*
  214. * FP, conversions
  215. */
  216. extern int _mm_cvt_ss2si(__m128 a);
  217. extern __m64 _mm_cvt_ps2pi(__m128 a);
  218. extern int _mm_cvtt_ss2si(__m128 a);
  219. extern __m64 _mm_cvtt_ps2pi(__m128 a);
  220. extern __m128 _mm_cvt_si2ss(__m128, int);
  221. extern __m128 _mm_cvt_pi2ps(__m128, __m64);
  222. /*
  223. * FP, misc
  224. */
  225. extern __m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
  226. extern __m128 _mm_unpackhi_ps(__m128 a, __m128 b);
  227. extern __m128 _mm_unpacklo_ps(__m128 a, __m128 b);
  228. extern __m128 _mm_loadh_pi(__m128, __m64 const*);
  229. extern __m128 _mm_movehl_ps(__m128, __m128);
  230. extern __m128 _mm_movelh_ps(__m128, __m128);
  231. extern void _mm_storeh_pi(__m64 *, __m128);
  232. extern __m128 _mm_loadl_pi(__m128, __m64 const*);
  233. extern void _mm_storel_pi(__m64 *, __m128);
  234. extern int _mm_movemask_ps(__m128 a);
  235. /*
  236. * Integer extensions
  237. */
  238. extern int _m_pextrw(__m64, int);
  239. extern __m64 _m_pinsrw(__m64, int, int);
  240. extern __m64 _m_pmaxsw(__m64, __m64);
  241. extern __m64 _m_pmaxub(__m64, __m64);
  242. extern __m64 _m_pminsw(__m64, __m64);
  243. extern __m64 _m_pminub(__m64, __m64);
  244. extern int _m_pmovmskb(__m64);
  245. extern __m64 _m_pmulhuw(__m64, __m64);
  246. extern __m64 _m_pshufw(__m64, int);
  247. extern void _m_maskmovq(__m64, __m64, char *);
  248. extern __m64 _m_pavgb(__m64, __m64);
  249. extern __m64 _m_pavgw(__m64, __m64);
  250. extern __m64 _m_psadbw(__m64, __m64);
  251. /*
  252. * memory & initialization
  253. */
  254. extern __m128 _mm_set_ss(float a);
  255. extern __m128 _mm_set_ps1(float a);
  256. extern __m128 _mm_set_ps(float a, float b, float c, float d);
  257. extern __m128 _mm_setr_ps(float a, float b, float c, float d);
  258. extern __m128 _mm_setzero_ps(void);
  259. extern __m128 _mm_load_ss(float const*a);
  260. extern __m128 _mm_load_ps1(float const*a);
  261. extern __m128 _mm_load_ps(float const*a);
  262. extern __m128 _mm_loadr_ps(float const*a);
  263. extern __m128 _mm_loadu_ps(float const*a);
  264. extern void _mm_store_ss(float *v, __m128 a);
  265. extern void _mm_store_ps1(float *v, __m128 a);
  266. extern void _mm_store_ps(float *v, __m128 a);
  267. extern void _mm_storer_ps(float *v, __m128 a);
  268. extern void _mm_storeu_ps(float *v, __m128 a);
  269. extern void _mm_prefetch(char const*a, int sel);
  270. extern void _mm_stream_pi(__m64 *, __m64);
  271. extern void _mm_stream_ps(float *, __m128);
  272. extern __m128 _mm_move_ss(__m128 a, __m128 b);
  273. extern void _mm_sfence(void);
  274. extern unsigned int _mm_getcsr(void);
  275. extern void _mm_setcsr(unsigned int);
  276. #ifdef __ICL
  277. extern void* __cdecl _mm_malloc(int siz, int al);
  278. extern void __cdecl _mm_free(void *p);
  279. #endif
  /*
   * Alternate intrinsic names definition
   *
   * Each macro maps an alternate spelling onto the function of the same
   * behavior that is prototyped in this header.
   */

  /* FP conversions */
  #define _mm_cvtss_si32    _mm_cvt_ss2si
  #define _mm_cvtps_pi32    _mm_cvt_ps2pi
  #define _mm_cvttss_si32   _mm_cvtt_ss2si
  #define _mm_cvttps_pi32   _mm_cvtt_ps2pi
  #define _mm_cvtsi32_ss    _mm_cvt_si2ss
  #define _mm_cvtpi32_ps    _mm_cvt_pi2ps

  /* integer extensions */
  #define _mm_extract_pi16  _m_pextrw
  #define _mm_insert_pi16   _m_pinsrw
  #define _mm_max_pi16      _m_pmaxsw
  #define _mm_max_pu8       _m_pmaxub
  #define _mm_min_pi16      _m_pminsw
  #define _mm_min_pu8       _m_pminub
  #define _mm_movemask_pi8  _m_pmovmskb
  #define _mm_mulhi_pu16    _m_pmulhuw
  #define _mm_shuffle_pi16  _m_pshufw
  #define _mm_maskmove_si64 _m_maskmovq
  #define _mm_avg_pu8       _m_pavgb
  #define _mm_avg_pu16      _m_pavgw
  #define _mm_sad_pu8       _m_psadbw

  /* broadcast set / load / store */
  #define _mm_set1_ps       _mm_set_ps1
  #define _mm_load1_ps      _mm_load_ps1
  #define _mm_store1_ps     _mm_store_ps1
  303. /******************************************************/
  304. /* UTILITY INTRINSICS FUNCTION DEFINITIONS START HERE */
  305. /******************************************************/
  306. /*********************************************************/
  307. /* NAME : _mm_cvtpi16_ps */
  308. /* DESCRIPTION : Convert 4 16-bit signed integer values */
  309. /* to 4 single-precision float values */
  310. /* IN : __m64 a */
  311. /* OUT : none */
  312. /* RETURN : __m128 : (float)a */
  313. /*********************************************************/
  314. __inline __m128 _mm_cvtpi16_ps(__m64 a)
  315. {
  316. __m128 tmp;
  317. __m64 ext_val = _mm_cmpgt_pi16(_mm_setzero_si64(), a);
  318. tmp = _mm_cvtpi32_ps(_mm_setzero_ps(), _mm_unpackhi_pi16(a, ext_val));
  319. return(_mm_cvtpi32_ps(_mm_movelh_ps(tmp, tmp),
  320. _mm_unpacklo_pi16(a, ext_val)));
  321. }
  322. /***********************************************************/
  323. /* NAME : _mm_cvtpu16_ps */
  324. /* DESCRIPTION : Convert 4 16-bit unsigned integer values */
  325. /* to 4 single-precision float values */
  326. /* IN : __m64 a */
  327. /* OUT : none */
  328. /* RETURN : __m128 : (float)a */
  329. /***********************************************************/
  330. __inline __m128 _mm_cvtpu16_ps(__m64 a)
  331. {
  332. __m128 tmp;
  333. __m64 ext_val = _mm_setzero_si64();
  334. tmp = _mm_cvtpi32_ps(_mm_setzero_ps(), _mm_unpackhi_pi16(a, ext_val));
  335. return(_mm_cvtpi32_ps(_mm_movelh_ps(tmp, tmp),
  336. _mm_unpacklo_pi16(a, ext_val)));
  337. }
  338. /******************************************************/
  339. /* NAME : _mm_cvtps_pi16 */
  340. /* DESCRIPTION : Convert 4 single-precision float */
  341. /* values to 4 16-bit integer values */
  342. /* IN : __m128 a */
  343. /* OUT : none */
  344. /* RETURN : __m64 : (short)a */
  345. /******************************************************/
  346. __inline __m64 _mm_cvtps_pi16(__m128 a)
  347. {
  348. return _mm_packs_pi32(_mm_cvtps_pi32(a),
  349. _mm_cvtps_pi32(_mm_movehl_ps(a, a)));
  350. }
  351. /******************************************************/
  352. /* NAME : _mm_cvtpi8_ps */
  353. /* DESCRIPTION : Convert 4 8-bit integer values to 4 */
  354. /* single-precision float values */
  355. /* IN : __m64 a */
  356. /* OUT : none */
  357. /* RETURN : __m128 : (float)a */
  358. /******************************************************/
  359. __inline __m128 _mm_cvtpi8_ps(__m64 a)
  360. {
  361. __m64 ext_val = _mm_cmpgt_pi8(_mm_setzero_si64(), a);
  362. return _mm_cvtpi16_ps(_mm_unpacklo_pi8(a, ext_val));
  363. }
  364. /******************************************************/
  365. /* NAME : _mm_cvtpu8_ps */
  366. /* DESCRIPTION : Convert 4 8-bit unsigned integer */
  367. /* values to 4 single-precision float */
  368. /* values */
  369. /* IN : __m64 a */
  370. /* OUT : none */
  371. /* RETURN : __m128 : (float)a */
  372. /******************************************************/
  373. __inline __m128 _mm_cvtpu8_ps(__m64 a)
  374. {
  375. return _mm_cvtpu16_ps(_mm_unpacklo_pi8(a, _mm_setzero_si64()));
  376. }
  377. /******************************************************/
  378. /* NAME : _mm_cvtps_pi8 */
  379. /* DESCRIPTION : Convert 4 single-precision float */
  380. /* values to 4 8-bit integer values */
  381. /* IN : __m128 a */
  382. /* OUT : none */
  383. /* RETURN : __m64 : (char)a */
  384. /******************************************************/
  385. __inline __m64 _mm_cvtps_pi8(__m128 a)
  386. {
  387. return _mm_packs_pi16(_mm_cvtps_pi16(a), _mm_setzero_si64());
  388. }
  389. /******************************************************/
  390. /* NAME : _mm_cvtpi32x2_ps */
  391. /* DESCRIPTION : Convert 4 32-bit integer values */
  392. /* to 4 single-precision float values */
  393. /* IN : __m64 a : operand 1 */
  394. /* __m64 b : operand 2 */
  395. /* OUT : none */
  396. /* RETURN : __m128 : (float)a,(float)b */
  397. /******************************************************/
  398. __inline __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
  399. {
  400. return _mm_movelh_ps(_mm_cvt_pi2ps(_mm_setzero_ps(), a),
  401. _mm_cvt_pi2ps(_mm_setzero_ps(), b));
  402. }
  #ifdef __cplusplus
  } /* End "C": closes the extern "C" block opened before the prototypes.
       A linkage-specification block takes no trailing ';' -- the extra
       semicolon was an empty declaration. */
  #endif /* __cplusplus */
  406. #endif /* _INCLUDED_MM2 */