Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

467 lines
16 KiB

  1. /**
  2. *** Copyright (C) 1985-1999 Intel Corporation. All rights reserved.
  3. ***
  4. *** The information and source code contained herein is the exclusive
  5. *** property of Intel Corporation and may not be disclosed, examined
  6. *** or reproduced in whole or in part without explicit written authorization
  7. *** from the company.
  8. ***
  9. **/
  10. /*
  11. * Definition of a C++ class interface to Streaming SIMD Extension intrinsics.
  12. *
  13. *
  14. * File name : fvec.h Fvec class definitions
  15. *
  16. * Concept: A C++ abstraction of Streaming SIMD Extensions designed to improve
  17. *
  18. * programmer productivity. Speed and accuracy are sacrificed for utility.
  19. *
  20. * Facilitates an easy transition to compiler intrinsics
  21. *
  22. * or assembly language.
  23. *
  24. * F32vec4: 4 packed single precision
  25. * 32-bit floating point numbers
  26. */
  27. #ifndef FVEC_H_INCLUDED
  28. #define FVEC_H_INCLUDED
  29. #if !defined __cplusplus
  30. #error ERROR: This file is only supported in C++ compilations!
  31. #endif /* !__cplusplus */
  32. #include <xmmintrin.h> /* Streaming SIMD Extensions Intrinsics include file */
  33. #include <assert.h>
  34. #include <ivec.h>
  35. /* Define _ENABLE_VEC_DEBUG to enable std::ostream inserters for debug output */
  36. #if defined(_ENABLE_VEC_DEBUG)
  37. #include <iostream>
  38. #endif
  39. #pragma pack(push,16) /* Must ensure class & union 16-B aligned */
  /*
   * EXPLICIT: spelling of the "explicit" keyword for the current compiler.
   *   _MSC_VER >= 1100 (MSVC5.0 and later)  -> explicit
   *   otherwise, __ICL set (MSVC4.x & ICL)  -> __explicit
   *   anything else                         -> expands to nothing, and a
   *                                            build-time message is emitted
   */
  #if (_MSC_VER >= 1100)
  #define EXPLICIT explicit
  #elif (__ICL)
  #define EXPLICIT __explicit
  #else
  #define EXPLICIT /* nothing */
  #pragma message( "explicit keyword not recognized")
  #endif
  /*
   * F32vec4: four packed single-precision floats held in one __m128.
   *
   * Operators and the friend helpers below forward to the matching
   * Streaming SIMD Extensions intrinsic. Every operand that is only read
   * is taken by const reference, so const objects and temporaries can be
   * used on the right-hand side of the compound assignments and passed to
   * add_horizontal().
   */
  class F32vec4
  {
  protected:
      __m128 vec;

  public:
      /* Constructors: __m128, 4 floats, 1 float, 1 double */

      /* Default constructor: vec is left uninitialized */
      F32vec4() {}

      /* Initialize 4 SP FP with __m128 data type (implicit conversion) */
      F32vec4(__m128 m) { vec = m; }

      /* Initialize 4 SP FPs with 4 floats, forwarded to _mm_set_ps as (f3,f2,f1,f0) */
      F32vec4(float f3, float f2, float f1, float f0) { vec = _mm_set_ps(f3, f2, f1, f0); }

      /* Explicitly initialize each of 4 SP FPs with same float */
      EXPLICIT F32vec4(float f) { vec = _mm_set_ps1(f); }

      /* Explicitly initialize each of 4 SP FPs with same double (narrowed to float) */
      EXPLICIT F32vec4(double d) { vec = _mm_set_ps1((float) d); }

      /* Assignment operations */
      F32vec4& operator =(float f) { vec = _mm_set_ps1(f); return *this; }
      F32vec4& operator =(double d) { vec = _mm_set_ps1((float) d); return *this; }
      F32vec4& operator =(const F32vec4 &a) { vec = a.vec; return *this; }
      F32vec4& operator =(const __m128 &avec) { vec = avec; return *this; }

      /* Conversion functions */
      operator __m128() const { return vec; } /* Convert to __m128 */

      /* Logical Operators */
      friend F32vec4 operator &(const F32vec4 &a, const F32vec4 &b) { return _mm_and_ps(a,b); }
      friend F32vec4 operator |(const F32vec4 &a, const F32vec4 &b) { return _mm_or_ps(a,b); }
      friend F32vec4 operator ^(const F32vec4 &a, const F32vec4 &b) { return _mm_xor_ps(a,b); }

      /* Arithmetic Operators */
      friend F32vec4 operator +(const F32vec4 &a, const F32vec4 &b) { return _mm_add_ps(a,b); }
      friend F32vec4 operator -(const F32vec4 &a, const F32vec4 &b) { return _mm_sub_ps(a,b); }
      friend F32vec4 operator *(const F32vec4 &a, const F32vec4 &b) { return _mm_mul_ps(a,b); }
      friend F32vec4 operator /(const F32vec4 &a, const F32vec4 &b) { return _mm_div_ps(a,b); }

      /* Compound assignment: the right-hand operand is only read */
      F32vec4& operator +=(const F32vec4 &a) { return *this = _mm_add_ps(vec,a); }
      F32vec4& operator -=(const F32vec4 &a) { return *this = _mm_sub_ps(vec,a); }
      F32vec4& operator *=(const F32vec4 &a) { return *this = _mm_mul_ps(vec,a); }
      F32vec4& operator /=(const F32vec4 &a) { return *this = _mm_div_ps(vec,a); }
      F32vec4& operator &=(const F32vec4 &a) { return *this = _mm_and_ps(vec,a); }
      F32vec4& operator |=(const F32vec4 &a) { return *this = _mm_or_ps(vec,a); }
      F32vec4& operator ^=(const F32vec4 &a) { return *this = _mm_xor_ps(vec,a); }

      /* Horizontal Add: returns a[0] + (a[1] + (a[2] + a[3])) */
      friend float add_horizontal(const F32vec4 &a)
      {
          /* Each shuffle moves element 1, 2 or 3 into slot 0 so the scalar
             adds can accumulate the total in slot 0 */
          F32vec4 ftemp = _mm_add_ss(a,_mm_add_ss(_mm_shuffle_ps(a, a, 1),_mm_add_ss(_mm_shuffle_ps(a, a, 2),_mm_shuffle_ps(a, a, 3))));
          return ftemp[0];
      }

      /* Square Root */
      friend F32vec4 sqrt(const F32vec4 &a) { return _mm_sqrt_ps(a); }

      /* Reciprocal */
      friend F32vec4 rcp(const F32vec4 &a) { return _mm_rcp_ps(a); }

      /* Reciprocal Square Root */
      friend F32vec4 rsqrt(const F32vec4 &a) { return _mm_rsqrt_ps(a); }

      /* NewtonRaphson Reciprocal
         [2 * rcpps(x) - (x * rcpps(x) * rcpps(x))] */
      friend F32vec4 rcp_nr(const F32vec4 &a)
      {
          F32vec4 Ra0 = _mm_rcp_ps(a);
          return _mm_sub_ps(_mm_add_ps(Ra0, Ra0), _mm_mul_ps(_mm_mul_ps(Ra0, a), Ra0));
      }

      /* NewtonRaphson Reciprocal Square Root
         0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) */
      friend F32vec4 rsqrt_nr(const F32vec4 &a)
      {
          static const F32vec4 fvecf0pt5(0.5f);
          static const F32vec4 fvecf3pt0(3.0f);
          F32vec4 Ra0 = _mm_rsqrt_ps(a);
          return (fvecf0pt5 * Ra0) * (fvecf3pt0 - (a * Ra0) * Ra0);
      }

      /* Compares: Mask is returned */
      /* Macros expand to all compare intrinsics. Example:
         friend F32vec4 cmpeq(const F32vec4 &a, const F32vec4 &b)
         { return _mm_cmpeq_ps(a,b);} */
  #define Fvec32s4_COMP(op) \
      friend F32vec4 cmp##op (const F32vec4 &a, const F32vec4 &b) { return _mm_cmp##op##_ps(a,b); }
      Fvec32s4_COMP(eq)   /* expanded to cmpeq(a,b) */
      Fvec32s4_COMP(lt)   /* expanded to cmplt(a,b) */
      Fvec32s4_COMP(le)   /* expanded to cmple(a,b) */
      Fvec32s4_COMP(gt)   /* expanded to cmpgt(a,b) */
      Fvec32s4_COMP(ge)   /* expanded to cmpge(a,b) */
      Fvec32s4_COMP(neq)  /* expanded to cmpneq(a,b) */
      Fvec32s4_COMP(nlt)  /* expanded to cmpnlt(a,b) */
      Fvec32s4_COMP(nle)  /* expanded to cmpnle(a,b) */
      Fvec32s4_COMP(ngt)  /* expanded to cmpngt(a,b) */
      Fvec32s4_COMP(nge)  /* expanded to cmpnge(a,b) */
  #undef Fvec32s4_COMP

      /* Min and Max */
      friend F32vec4 simd_min(const F32vec4 &a, const F32vec4 &b) { return _mm_min_ps(a,b); }
      friend F32vec4 simd_max(const F32vec4 &a, const F32vec4 &b) { return _mm_max_ps(a,b); }

      /* Debug Features */
  #if defined(_ENABLE_VEC_DEBUG)
      /* Output */
      friend std::ostream & operator<<(std::ostream & os, const F32vec4 &a)
      {
          /* To use: cout << "Elements of F32vec4 fvec are: " << fvec; */
          /* Read-only view of the four floats; constness of a is preserved */
          const float *fp = (const float*)&a;
          os << "[3]:" << *(fp+3)
             << " [2]:" << *(fp+2)
             << " [1]:" << *(fp+1)
             << " [0]:" << *fp;
          return os;
      }
  #endif

      /* Element Access Only, no modifications to elements */
      const float& operator[](int i) const
      {
          /* Assert enabled only during debug /DDEBUG */
          assert((0 <= i) && (i <= 3)); /* User should only access elements 0-3 */
          const float *fp = (const float*)&vec;
          return *(fp+i);
      }

      /* Element Access and Modification */
      float& operator[](int i)
      {
          /* Assert enabled only during debug /DDEBUG */
          assert((0 <= i) && (i <= 3)); /* User should only access elements 0-3 */
          float *fp = (float*)&vec;
          return *(fp+i);
      }
  };
  168. /* Miscellaneous */
  169. /* Interleave low order data elements of a and b into destination */
  170. inline F32vec4 unpack_low(const F32vec4 &a, const F32vec4 &b)
  171. { return _mm_unpacklo_ps(a, b); }
  172. /* Interleave high order data elements of a and b into target */
  173. inline F32vec4 unpack_high(const F32vec4 &a, const F32vec4 &b)
  174. { return _mm_unpackhi_ps(a, b); }
  175. /* Move Mask to Integer returns 4 bit mask formed of most significant bits of a */
  176. inline int move_mask(const F32vec4 &a)
  177. { return _mm_movemask_ps(a);}
  178. /* Data Motion Functions */
  179. /* Load Unaligned loadu_ps: Unaligned */
  180. inline void loadu(F32vec4 &a, float *p)
  181. { a = _mm_loadu_ps(p); }
  182. /* Store Temporal storeu_ps: Unaligned */
  183. inline void storeu(float *p, const F32vec4 &a)
  184. { _mm_storeu_ps(p, a); }
  185. /* Cacheability Support */
  186. /* Non-Temporal Store */
  187. inline void store_nta(float *p, F32vec4 &a)
  188. { _mm_stream_ps(p,a);}
  189. /* Conditional Selects:*/
  190. /*(a OP b)? c : d; where OP is any compare operator
  191. Macros expand to conditional selects which use all compare intrinsics.
  192. Example:
  193. friend F32vec4 select_eq(const F32vec4 &a, const F32vec4 &b, const F32vec4 &c, const F32vec4 &d)
  194. {
  195. F32vec4 mask = _mm_cmpeq_ps(a,b);
  196. return( (mask & c) | F32vec4((_mm_andnot_ps(mask,d))));
  197. }
  198. */
  199. #define Fvec32s4_SELECT(op) \
  200. inline F32vec4 select_##op (const F32vec4 &a, const F32vec4 &b, const F32vec4 &c, const F32vec4 &d) \
  201. { \
  202. F32vec4 mask = _mm_cmp##op##_ps(a,b); \
  203. return( (mask & c) | F32vec4((_mm_andnot_ps(mask,d)))); \
  204. }
  205. Fvec32s4_SELECT(eq) // generates select_eq(a,b)
  206. Fvec32s4_SELECT(lt) // generates select_lt(a,b)
  207. Fvec32s4_SELECT(le) // generates select_le(a,b)
  208. Fvec32s4_SELECT(gt) // generates select_gt(a,b)
  209. Fvec32s4_SELECT(ge) // generates select_ge(a,b)
  210. Fvec32s4_SELECT(neq) // generates select_neq(a,b)
  211. Fvec32s4_SELECT(nlt) // generates select_nlt(a,b)
  212. Fvec32s4_SELECT(nle) // generates select_nle(a,b)
  213. Fvec32s4_SELECT(ngt) // generates select_ngt(a,b)
  214. Fvec32s4_SELECT(nge) // generates select_nge(a,b)
  215. #undef Fvec32s4_SELECT
  216. /* Streaming SIMD Extensions Integer Intrinsics */
  217. /* Max and Min */
  218. inline Is16vec4 simd_max(const Is16vec4 &a, const Is16vec4 &b) { return _m_pmaxsw(a,b);}
  219. inline Is16vec4 simd_min(const Is16vec4 &a, const Is16vec4 &b) { return _m_pminsw(a,b);}
  220. inline Iu8vec8 simd_max(const Iu8vec8 &a, const Iu8vec8 &b) { return _m_pmaxub(a,b);}
  221. inline Iu8vec8 simd_min(const Iu8vec8 &a, const Iu8vec8 &b) { return _m_pminub(a,b);}
  222. /* Average */
  223. inline Iu16vec4 simd_avg(const Iu16vec4 &a, const Iu16vec4 &b) { return _m_pavgw(a,b); }
  224. inline Iu8vec8 simd_avg(const Iu8vec8 &a, const Iu8vec8 &b) { return _m_pavgb(a,b); }
  225. /* Move ByteMask To Int: returns mask formed from most sig bits of each vec of a */
  226. inline int move_mask(const I8vec8 &a) { return _m_pmovmskb(a);}
  227. /* Packed Multiply High Unsigned */
  228. inline Iu16vec4 mul_high(const Iu16vec4 &a, const Iu16vec4 &b) { return _m_pmulhuw(a,b); }
  229. /* Byte Mask Write: Write bytes if most significant bit in each corresponding byte is set */
  230. inline void mask_move(const I8vec8 &a, const I8vec8 &b, char *addr) { _m_maskmovq(a, b, addr); }
  231. /* Data Motion: Store Non Temporal */
  232. inline void store_nta(__m64 *p, M64 &a) { _mm_stream_pi(p,a); }
  233. /* Conversions between ivec <-> fvec */
  234. /* Convert first element of F32vec4 to int with truncation */
  235. inline int F32vec4ToInt(const F32vec4 &a)
  236. {
  237. return _mm_cvtt_ss2si(a);
  238. }
  239. /* Convert two lower SP FP values of a to Is32vec2 with truncation */
  240. inline Is32vec2 F32vec4ToIs32vec2 (const F32vec4 &a)
  241. {
  242. __m64 result;
  243. result = _mm_cvtt_ps2pi(a);
  244. return Is32vec2(result);
  245. }
  246. /* Convert the 32-bit int i to an SP FP value; the upper three SP FP values are passed through from a. */
  247. inline F32vec4 IntToF32vec4(const F32vec4 &a, int i)
  248. {
  249. __m128 result;
  250. result = _mm_cvt_si2ss(a,i);
  251. return F32vec4(result);
  252. }
  253. /* Convert the two 32-bit integer values in b to two SP FP values; the upper two SP FP values are passed from a. */
  254. inline F32vec4 Is32vec2ToF32vec4(const F32vec4 &a, const Is32vec2 &b)
  255. {
  256. __m128 result;
  257. result = _mm_cvt_pi2ps(a,b);
  258. return F32vec4(result);
  259. }
  /*
   * F32vec1: one single-precision float carried in an __m128.
   *
   * Arithmetic, sqrt/rcp/rsqrt, compares and min/max forward to the scalar
   * (_ss) intrinsics; the logical operators forward to the packed (_ps)
   * and/or/xor intrinsics. Operands that are only read are taken by const
   * reference, so const objects and temporaries are accepted.
   */
  class F32vec1
  {
  protected:
      __m128 vec;

  public:
      /* Constructors: none, int, float, double, __m128 */

      /* Default constructor: vec is left uninitialized */
      F32vec1() {}

      /* Convert i with _mm_cvt_si2ss. The pass-through operand is an
         explicitly zeroed vector, so the uninitialized member is never read. */
      F32vec1(int i) { vec = _mm_cvt_si2ss(_mm_setzero_ps(), i); }

      /* Explicitly initialize from one float using _mm_set_ss */
      EXPLICIT F32vec1(float f) { vec = _mm_set_ss(f); }

      /* Explicitly initialize from one double (narrowed to float) using _mm_set_ss */
      EXPLICIT F32vec1(double d) { vec = _mm_set_ss((float) d); }

      /* Initialize with __m128 data type */
      F32vec1(__m128 m) { vec = m; }

      /* Conversion functions */
      operator __m128() const { return vec; } /* Convert to __m128 */

      /* Logical Operators */
      friend F32vec1 operator &(const F32vec1 &a, const F32vec1 &b) { return _mm_and_ps(a,b); }
      friend F32vec1 operator |(const F32vec1 &a, const F32vec1 &b) { return _mm_or_ps(a,b); }
      friend F32vec1 operator ^(const F32vec1 &a, const F32vec1 &b) { return _mm_xor_ps(a,b); }

      /* Arithmetic Operators */
      friend F32vec1 operator +(const F32vec1 &a, const F32vec1 &b) { return _mm_add_ss(a,b); }
      friend F32vec1 operator -(const F32vec1 &a, const F32vec1 &b) { return _mm_sub_ss(a,b); }
      friend F32vec1 operator *(const F32vec1 &a, const F32vec1 &b) { return _mm_mul_ss(a,b); }
      friend F32vec1 operator /(const F32vec1 &a, const F32vec1 &b) { return _mm_div_ss(a,b); }

      /* Compound assignment: the right-hand operand is only read */
      F32vec1& operator +=(const F32vec1 &a) { return *this = _mm_add_ss(vec,a); }
      F32vec1& operator -=(const F32vec1 &a) { return *this = _mm_sub_ss(vec,a); }
      F32vec1& operator *=(const F32vec1 &a) { return *this = _mm_mul_ss(vec,a); }
      F32vec1& operator /=(const F32vec1 &a) { return *this = _mm_div_ss(vec,a); }
      F32vec1& operator &=(const F32vec1 &a) { return *this = _mm_and_ps(vec,a); }
      F32vec1& operator |=(const F32vec1 &a) { return *this = _mm_or_ps(vec,a); }
      F32vec1& operator ^=(const F32vec1 &a) { return *this = _mm_xor_ps(vec,a); }

      /* Square Root */
      friend F32vec1 sqrt(const F32vec1 &a) { return _mm_sqrt_ss(a); }

      /* Reciprocal */
      friend F32vec1 rcp(const F32vec1 &a) { return _mm_rcp_ss(a); }

      /* Reciprocal Square Root */
      friend F32vec1 rsqrt(const F32vec1 &a) { return _mm_rsqrt_ss(a); }

      /* NewtonRaphson Reciprocal
         [2 * rcpss(x) - (x * rcpss(x) * rcpss(x))] */
      friend F32vec1 rcp_nr(const F32vec1 &a)
      {
          F32vec1 Ra0 = _mm_rcp_ss(a);
          return _mm_sub_ss(_mm_add_ss(Ra0, Ra0), _mm_mul_ss(_mm_mul_ss(Ra0, a), Ra0));
      }

      /* NewtonRaphson Reciprocal Square Root
         0.5 * rsqrtss * (3 - x * rsqrtss(x) * rsqrtss(x)) */
      friend F32vec1 rsqrt_nr(const F32vec1 &a)
      {
          static const F32vec1 fvecf0pt5(0.5f);
          static const F32vec1 fvecf3pt0(3.0f);
          F32vec1 Ra0 = _mm_rsqrt_ss(a);
          return (fvecf0pt5 * Ra0) * (fvecf3pt0 - (a * Ra0) * Ra0);
      }

      /* Compares: Mask is returned */
      /* Macros expand to all compare intrinsics. Example:
         friend F32vec1 cmpeq(const F32vec1 &a, const F32vec1 &b)
         { return _mm_cmpeq_ss(a,b);} */
  #define Fvec32s1_COMP(op) \
      friend F32vec1 cmp##op (const F32vec1 &a, const F32vec1 &b) { return _mm_cmp##op##_ss(a,b); }
      Fvec32s1_COMP(eq)   /* expanded to cmpeq(a,b) */
      Fvec32s1_COMP(lt)   /* expanded to cmplt(a,b) */
      Fvec32s1_COMP(le)   /* expanded to cmple(a,b) */
      Fvec32s1_COMP(gt)   /* expanded to cmpgt(a,b) */
      Fvec32s1_COMP(ge)   /* expanded to cmpge(a,b) */
      Fvec32s1_COMP(neq)  /* expanded to cmpneq(a,b) */
      Fvec32s1_COMP(nlt)  /* expanded to cmpnlt(a,b) */
      Fvec32s1_COMP(nle)  /* expanded to cmpnle(a,b) */
      Fvec32s1_COMP(ngt)  /* expanded to cmpngt(a,b) */
      Fvec32s1_COMP(nge)  /* expanded to cmpnge(a,b) */
  #undef Fvec32s1_COMP

      /* Min and Max */
      friend F32vec1 simd_min(const F32vec1 &a, const F32vec1 &b) { return _mm_min_ss(a,b); }
      friend F32vec1 simd_max(const F32vec1 &a, const F32vec1 &b) { return _mm_max_ss(a,b); }

      /* Debug Features */
  #if defined(_ENABLE_VEC_DEBUG)
      /* Output */
      friend std::ostream & operator<<(std::ostream & os, const F32vec1 &a)
      {
          /* To use: cout << "Elements of F32vec1 fvec are: " << fvec; */
          /* Read-only view of the first float; constness of a is preserved */
          const float *fp = (const float*)&a;
          os << "float:" << *fp;
          return os;
      }
  #endif
  };
  346. /* Conditional Selects:*/
  347. /*(a OP b)? c : d; where OP is any compare operator
  348. Macros expand to conditional selects which use all compare intrinsics.
  349. Example:
  350. friend F32vec1 select_eq(const F32vec1 &a, const F32vec1 &b, const F32vec1 &c, const F32vec1 &d)
  351. {
  352. F32vec1 mask = _mm_cmpeq_ss(a,b);
  353. return( (mask & c) | F32vec1((_mm_andnot_ps(mask,d))));
  354. }
  355. */
  356. #define Fvec32s1_SELECT(op) \
  357. inline F32vec1 select_##op (const F32vec1 &a, const F32vec1 &b, const F32vec1 &c, const F32vec1 &d) \
  358. { \
  359. F32vec1 mask = _mm_cmp##op##_ss(a,b); \
  360. return( (mask & c) | F32vec1((_mm_andnot_ps(mask,d)))); \
  361. }
  362. Fvec32s1_SELECT(eq) // generates select_eq(a,b)
  363. Fvec32s1_SELECT(lt) // generates select_lt(a,b)
  364. Fvec32s1_SELECT(le) // generates select_le(a,b)
  365. Fvec32s1_SELECT(gt) // generates select_gt(a,b)
  366. Fvec32s1_SELECT(ge) // generates select_ge(a,b)
  367. Fvec32s1_SELECT(neq) // generates select_neq(a,b)
  368. Fvec32s1_SELECT(nlt) // generates select_nlt(a,b)
  369. Fvec32s1_SELECT(nle) // generates select_nle(a,b)
  370. Fvec32s1_SELECT(ngt) // generates select_ngt(a,b)
  371. Fvec32s1_SELECT(nge) // generates select_nge(a,b)
  372. #undef Fvec32s1_SELECT
  373. /* Conversions between ivec <-> fvec */
  374. /* Convert F32vec1 to int */
  375. inline int F32vec1ToInt(const F32vec1 &a)
  376. {
  377. return _mm_cvtt_ss2si(a);
  378. }
  379. #pragma pack(pop) /* 16-B aligned */
  380. #endif /* FVEC_H_INCLUDED */