Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1534 lines
53 KiB

  1. #include "r_studiosubd_patches.h"
  2. #include "tier1/convar.h"
  3. #include <stdio.h>
  // Value of pi used by the stencil and tangent math in this file.
  #define PI 3.14159265

  #ifdef _DEBUG
  // Debug-build-only lists: the corner, edge and interior evaluators in this
  // file append every control point they produce to the matching vector.
  CUtlVector<Vector4D> g_DebugCornerPositions;
  CUtlVector<Vector4D> g_DebugEdgePositions;
  CUtlVector<Vector4D> g_DebugInteriorPositions;
  #endif
  10. //----------------------------------------------------------------------------------------------
  11. // static stencil buffers
  12. //----------------------------------------------------------------------------------------------
  13. #if !defined( USE_OPT )
  14. static float sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
  15. static float sPosEdge1Stencil[MAX_VALENCE+1][6];
  16. static float sPosEdge2Stencil[MAX_VALENCE+1][6];
  17. static float sPosInteriorStencil[MAX_VALENCE+1][4];
  18. static float sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  19. static float sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  20. static float sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  21. static float sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  22. static float sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  23. static float sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  24. static float sPosGregoryInterior1Stencil[6];
  25. static float sPosGregoryInterior2Stencil[6];
  26. static float sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  27. static float sPosEdge1BndStencil[MAX_VALENCE+1][6];
  28. static float sPosEdge2BndStencil[MAX_VALENCE+1][6];
  29. static float sPosInteriorBndStencil[MAX_VALENCE+1][4];
  30. static float sPosEdge1CornerStencil[MAX_VALENCE+1][6];
  31. static float sPosEdge2CornerStencil[MAX_VALENCE+1][6];
  32. #endif
  // Set by FillTables() once the stencil tables have been built, so that
  // later calls return immediately.
  static bool sTableInited = false;

  // Written by set_CornerCorrection(); not read in the visible portion of this file.
  static bool sCornerCorrection = false;

  // When true, ComputeACCTangentPatches() keeps the tangents derived from the
  // geometry patch and skips the limit-tangent / correction pass.
  static bool sShowACCGeometryTangents = false;

  // When false, ComputeCatmullClarkLimitTangents() clears its cornerVtx
  // argument, so corner vertices take the smooth-boundary path.
  static bool sUseCornerTangents = true;
  37. void set_ShowACCGeometryTangents(bool v)
  38. {
  39. sShowACCGeometryTangents = v;
  40. }
  41. void set_CornerCorrection(bool v)
  42. {
  43. sCornerCorrection = v;
  44. }
  45. void set_UseCornerTangents(bool v)
  46. {
  47. sUseCornerTangents = v;
  48. }
  49. // averaging function over geometry patch tangents.
  50. static float tangentAveraging( int n, int j)
  51. {
  52. return sin( PI * j / (float) n );
  53. }
  54. //--------------------------------------------------------------------------------------
  55. // Subdiv Stencils
  56. //--------------------------------------------------------------------------------------
  57. #if !defined( USE_OPT )
  58. static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, float *stencilBuffer)
  59. {
  60. VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );
  61. memset(stencilBuffer, 0, 2*n*sizeof(float));
  62. if (!boundary)
  63. {
  64. float scale = 1.0f / (n*n + 5.0f*n);
  65. stencilBuffer[0] = n*n * scale;
  66. for (int i=0; i<n; i++)
  67. {
  68. stencilBuffer[2*i+1] = 4.0f * scale;
  69. stencilBuffer[2*i+2] = 1.0f * scale;
  70. }
  71. }
  72. else
  73. {
  74. int k = n-1;
  75. float s = 1.0f / 6.0f;
  76. stencilBuffer[0] = s * 4.0f;
  77. stencilBuffer[1] = s * 1.0f;
  78. stencilBuffer[2*k+1] = s * 1.0f;
  79. }
  80. }
  81. static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, float *stencilBuffer1, float *stencilBuffer2)
  82. {
  83. VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );
  84. memset( stencilBuffer1, 0, sizeof(float) * 2*n );
  85. memset( stencilBuffer2, 0, sizeof(float) * 2*n );
  86. if ( !bndVtx )
  87. {
  88. float scale_beta = 1.0f / (n * sqrtf( 4.0f + cos( PI / n ) * cos( PI / n ) ) );
  89. float scale_alpha = 1.0f / n + cos( PI / n ) * scale_beta;
  90. for ( int i=0; i<n; i++ )
  91. {
  92. stencilBuffer1[2*i+1] = cos( 2*PI*i/n ) * scale_alpha;
  93. stencilBuffer1[2*i+2] = cos((2*PI*i+PI)/n ) * scale_beta;
  94. int j = (i - 1)%n;
  95. stencilBuffer2[2*i+1] = cos( 2*PI*j/n ) * scale_alpha;
  96. stencilBuffer2[2*i+2] = cos((2*PI*j+PI)/n ) * scale_beta;
  97. }
  98. }
  99. else
  100. {
  101. // boundary vertex cases
  102. if ( cornerVtx )
  103. {
  104. if ( n<=2 )
  105. return;
  106. float sectorScale = 0, w;
  107. // treat first and last tangent (crease edges) separately
  108. w = tangentAveraging( n-1, 0 ); sectorScale += w;
  109. stencilBuffer1[ 1] += 0.5 * w;
  110. stencilBuffer1[ 0] += -0.5 * w;
  111. w = tangentAveraging( n-1, n-1 ); sectorScale += w;
  112. stencilBuffer1[ 2*(n-1)+1] += 0.5 * w;
  113. stencilBuffer1[ 0 ] += -0.5 * w;
  114. // inner tangents are computed using the 6 weights from the geometery edge construction.
  115. for (int k=1; k<(n-1); k++)
  116. {
  117. w = tangentAveraging( n-1, k ); sectorScale += w;
  118. float scale = 1.0f / (2.0f*n + 10.0f);
  119. stencilBuffer1[ 0] += w * (2.0f*n * scale - 1.0f);
  120. stencilBuffer1[2*(k-1)+1] += w * 2.0f * scale;
  121. stencilBuffer1[2*(k-1)+2] += w * 1.0f * scale;
  122. stencilBuffer1[2*(k-1)+3] += w * 4.0f * scale;
  123. stencilBuffer1[2*(k-1)+4] += w * 1.0f * scale;
  124. stencilBuffer1[2*(k-1)+5] += w * 2.0f * scale;
  125. }
  126. // rescale weights
  127. for (int k = 0; k<2*n; k++)
  128. {
  129. stencilBuffer1[k] /= sectorScale;
  130. }
  131. }
  132. else
  133. {
  134. // special case to avoid colinear tangents
  135. if ( n==2 )
  136. {
  137. float s = 1.0f / 2.0f;
  138. stencilBuffer1[1] = 1.0 * s;
  139. stencilBuffer1[3] =-1.0 * s;
  140. stencilBuffer2[1] =-1.0 * s;
  141. stencilBuffer2[3] = 1.0 * s;
  142. // regularization term to avoid collinearity and preserve limit normal at the boundary
  143. float eps = 1e-4;
  144. stencilBuffer1[0] += eps * (-4.0/3.0);
  145. stencilBuffer1[1] += eps * (1.0/2.0);
  146. stencilBuffer1[2] += eps * (1.0/3.0);
  147. stencilBuffer1[3] += eps * (1.0/2.0);
  148. stencilBuffer2[0] += eps * (-4.0/3.0);
  149. stencilBuffer2[1] += eps * (1.0/2.0);
  150. stencilBuffer2[2] += eps * (1.0/3.0);
  151. stencilBuffer2[3] += eps * (1.0/2.0);
  152. }
  153. else
  154. {
  155. int k = n-1;
  156. float c = cos( PI / k ), s=sin( PI / k );
  157. stencilBuffer1[2*0+1] = 0.5f;
  158. stencilBuffer1[2*k+1] = -0.5f;
  159. stencilBuffer2[0] = -4.0f*s / (3.0f*k + c); // gamma
  160. for (int i=0; i<k; ++i)
  161. {
  162. stencilBuffer2[2*i+1] = 4*sin(PI*i/k)/(3*k+c); // alpha_i
  163. stencilBuffer2[2*i+2] = (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c); // beta_i
  164. }
  165. stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) ); // alpha_0, alpha_k
  166. }
  167. }
  168. }
  169. }
  170. static void computeACCEdgePosStencils(byte boundary, byte corner, int n, float *stencilBuffer1, float *stencilBuffer2)
  171. {
  172. VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );
  173. memset(stencilBuffer1, 0, 6*sizeof(float));
  174. memset(stencilBuffer2, 0, 6*sizeof(float));
  175. if ( !boundary )
  176. {
  177. float scale = 1.0f / (2.0f*n + 10.0f);
  178. stencilBuffer1[0] = 2.0f*n * scale; stencilBuffer2[0] = 4.0f * scale;
  179. stencilBuffer1[1] = 2.0f * scale; stencilBuffer2[1] = 1.0f * scale;
  180. stencilBuffer1[2] = 1.0f * scale; stencilBuffer2[2] = 2.0f * scale;
  181. stencilBuffer1[3] = 4.0f * scale; stencilBuffer2[3] = 2.0f*n* scale;
  182. stencilBuffer1[4] = 1.0f * scale; stencilBuffer2[4] = 2.0f * scale;
  183. stencilBuffer1[5] = 2.0f * scale; stencilBuffer2[5] = 1.0f * scale;
  184. }
  185. else
  186. { // boundary stencil
  187. if ( corner )
  188. {
  189. float scale = 1.0f / (3.0f);
  190. stencilBuffer1[0] = 2.0f * scale; stencilBuffer2[0] = 1.0f * scale;
  191. stencilBuffer1[3] = 1.0f * scale; stencilBuffer2[3] = 2.0f * scale;
  192. }
  193. else
  194. {
  195. float scale = 1.0f / 3.0f;
  196. stencilBuffer1[0] = 2.0f * scale; stencilBuffer2[0] = 1.0f * scale;
  197. stencilBuffer1[3] = 1.0f * scale; stencilBuffer2[3] = 2.0f * scale;
  198. }
  199. }
  200. }
  201. static void computeACCInteriorPosStencil(byte boundary, int n, float *stencilBuffer)
  202. {
  203. VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );
  204. float scale = 1.0f / (n + 5.0f);
  205. stencilBuffer[0] = n * scale;
  206. stencilBuffer[1] = 2.0f * scale;
  207. stencilBuffer[2] = 1.0f * scale;
  208. stencilBuffer[3] = 2.0f * scale;
  209. }
  210. void FillTables()
  211. {
  212. if ( sTableInited ) return;
  213. for ( int val=0; val<=MAX_VALENCE; val++ )
  214. {
  215. // interior stencils
  216. computeCatmullClarkLimitPosStencil(false, val, sPosCornerStencil[val]);
  217. computeACCEdgePosStencils(false, false, val, sPosEdge1Stencil[val], sPosEdge2Stencil[val]);
  218. computeACCInteriorPosStencil(false, val, sPosInteriorStencil[val]);
  219. // boundary stencils
  220. computeCatmullClarkLimitPosStencil(true, val, sPosCornerBndStencil[val]);
  221. computeACCEdgePosStencils(true, false, val, sPosEdge1BndStencil[val], sPosEdge2BndStencil[val]);
  222. computeACCEdgePosStencils(true, true, val, sPosEdge1CornerStencil[val], sPosEdge2CornerStencil[val]);
  223. computeACCInteriorPosStencil(true, val, sPosInteriorBndStencil[val]);
  224. computeCatmullClarkLimitTanStencil(false, false, val, sCCLimitTanStencil1[val], sCCLimitTanStencil2[val]);
  225. computeCatmullClarkLimitTanStencil(true, false, val, sCCLimitTanBndStencil1[val], sCCLimitTanBndStencil2[val]);
  226. computeCatmullClarkLimitTanStencil(true, true, val, sCCLimitTanCornerStencil1[val], sCCLimitTanCornerStencil2[val]);
  227. }
  228. sTableInited = true;
  229. }
  230. //--------------------------------------------------------------------------------------
  231. // Runtime
  232. //--------------------------------------------------------------------------------------
  #ifdef _DEBUG
  // Debug-build-only cheat-flagged ConVar, default "0". Nothing in the visible
  // portion of this file reads it.
  static ConVar mat_tess_dump( "mat_tess_dump", "0", FCVAR_CHEAT );
  #endif
  236. // Compute corner control points for each patch
  237. inline void ComputeCatmullClarkLimitPosition( Vector4D *pPos, unsigned short *oneRing,
  238. unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
  239. unsigned short cornerVtx, unsigned short valence, unsigned short nbCorners, Vector4D &limitPos )
  240. {
  241. VPROF_BUDGET( "ComputeCatmullClarkLimitPosition", _T("SubD Rendering") );
  242. if ( cornerVtx > 0 )
  243. {
  244. limitPos = pPos[ oneRing[0] ];
  245. }
  246. else
  247. {
  248. assert( valence <= MAX_VALENCE );
  249. float *pStencil = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ];
  250. // pStencil[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer)
  251. limitPos = pStencil[0] * pPos[ oneRing[0] ];
  252. for ( int k = 0; k < vtx1RingSize; k++ )
  253. {
  254. int idx = ( k + minOneRingIndex ) % vtx1RingSize; // Shuffle to get the minimum index consistently first in order
  255. if ( idx != 0 ) // Don't do pStencil[0] again
  256. {
  257. limitPos += pStencil[idx] * pPos[ oneRing[idx] ];
  258. }
  259. }
  260. }
  261. #ifdef _DEBUG
  262. g_DebugCornerPositions.AddToTail( limitPos );
  263. #endif
  264. }
  265. inline Vector4D CrossProduct(const Vector4D& a, const Vector4D& b)
  266. {
  267. return Vector4D( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f );
  268. }
  269. inline float VectorNormalize(Vector4D& vec)
  270. {
  271. float radius = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);
  272. // FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero.
  273. float iradius = 1.f / ( radius + FLT_EPSILON );
  274. vec.x *= iradius;
  275. vec.y *= iradius;
  276. vec.z *= iradius;
  277. return radius;
  278. }
  279. FORCEINLINE float DotProduct(const Vector4D& a, const Vector4D& b)
  280. {
  281. return ( a.x*b.x + a.y*b.y + a.z*b.z );
  282. }
  283. inline void ComputeCatmullClarkLimitTangents( int idx, Vector4D *pPos, unsigned short *oneRing, unsigned short vtx1RingSize,
  284. unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
  285. unsigned short valence, unsigned short &loopGapAngle,
  286. Vector4D &limitTanU, Vector4D &limitTanV )
  287. {
  288. // for valence=1, no need to have separate tangents
  289. float tanUSign[] = {1,-1,-1,1};
  290. float tanVSign[] = {1,1,-1,-1};
  291. VPROF_BUDGET( "ComputeCatmullClarkLimitTangents", _T("SubD Rendering") );
  292. if ( !sUseCornerTangents )
  293. cornerVtx = 0;
  294. if ( !bndVtx ) // interior vertices
  295. {
  296. float *stencil1 = sCCLimitTanStencil1[ valence ];
  297. float *stencil2 = sCCLimitTanStencil2[ valence ];
  298. limitTanU = Vector4D(0,0,0,0);
  299. limitTanV = Vector4D(0,0,0,0);
  300. for (int k = 0; k < vtx1RingSize; ++k)
  301. {
  302. limitTanU += stencil1[k] * pPos[ oneRing[k] ];
  303. limitTanV += stencil2[k] * pPos[ oneRing[k] ];
  304. }
  305. }
  306. else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) ) // smooth boundary vertices
  307. {
  308. float *stencil1 = sCCLimitTanBndStencil1[ valence ];
  309. float *stencil2 = sCCLimitTanBndStencil2[ valence ];
  310. Vector4D r0 = Vector4D(0,0,0,0);
  311. Vector4D r1 = Vector4D(0,0,0,0);
  312. for (int k = 0; k < vtx1RingSize; ++k)
  313. {
  314. r0 += stencil1[k] * pPos[ oneRing[k] ];
  315. r1 += stencil2[k] * pPos[ oneRing[k] ];
  316. }
  317. int j1 = (centerOffset - 1) / 2;
  318. int j2 = j1+1;
  319. int K = (valence - 1);
  320. if (valence == 2)
  321. {
  322. limitTanU = r0;
  323. limitTanV = r1;
  324. }
  325. else
  326. {
  327. limitTanU = cos(PI*j1 / K) * r0 + sin(PI*j1 / K) * r1;
  328. limitTanV = cos(PI*j2 / K) * r0 + sin(PI*j2 / K) * r1;
  329. }
  330. }
  331. else // corner vertices
  332. {
  333. if ( valence == 2 )
  334. return;
  335. float *pEdgeStencil = sPosEdge1Stencil[ valence ];
  336. // float *avgStencil = sCCLimitTanCornerStencil1[ valence ];
  337. // compute tangents
  338. Vector4D c0 = pPos[ oneRing[1] ] - pPos[ oneRing[0] ]; c0.w = 0;
  339. Vector4D c1 = pPos[ oneRing[vtx1RingSize - 1] ] - pPos[ oneRing[0] ]; c1.w = 0;
  340. Vector4D e0 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
  341. Vector4D e1 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
  342. for (int k = 1; k < 6; k++ )
  343. {
  344. e0 += pEdgeStencil[k] * pPos[ oneRing[ k ] ];
  345. e1 += pEdgeStencil[k] * pPos[ oneRing[ vtx1RingSize - 6 + k ] ];
  346. }
  347. e0.w = 0; e1.w = 0;
  348. // compute average tangent plane normal
  349. Vector4D n0 = CrossProduct( c0, e0 ); VectorNormalize( n0 );
  350. Vector4D n1 = CrossProduct( e1, c1 ); VectorNormalize( n1 );
  351. Vector4D N = n0 + n1;
  352. // N = N - ( DotProduct( N, tAvg )/ DotProduct(tAvg, tAvg) ) * tAvg;
  353. VectorNormalize( N );
  354. // project into tangent plane
  355. c0 = c0 - DotProduct(c0, N) * N;
  356. c1 = c1 - DotProduct(c1, N) * N;
  357. float c0l = Vector4DLength( c0 ); c0 = c0 / c0l;
  358. float c1l = Vector4DLength( c1 ); c1 = c1 / c1l;
  359. float cAvg = (c0l + c1l) / 2;
  360. // compute angle
  361. Vector4D c0p = CrossProduct(N, c0);
  362. float angle = PI - atan2( DotProduct(c0p, c1), -DotProduct(c0, c1) );
  363. loopGapAngle = (unsigned int) ((65535.0 * angle) / (2*PI));
  364. // compute final tangent vector
  365. int j1 = (centerOffset - 1) / 2;
  366. int j2 = j1+1;
  367. int K = (valence - 1);
  368. limitTanU = cAvg * ( cos(angle*j1 / K) * c0 + sin(angle*j1 / K) * c0p );
  369. limitTanV = cAvg * ( cos(angle*j2 / K) * c0 + sin(angle*j2 / K) * c0p );
  370. }
  371. // flip tangents so they point in u/v direction
  372. if ( idx & 1 )
  373. {
  374. swap(limitTanU, limitTanV);
  375. }
  376. limitTanU *= tanUSign[idx];
  377. limitTanV *= tanVSign[idx];
  378. }
  379. inline void ComputeACCEdgePositions( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset,
  380. unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
  381. unsigned short cornerVtx0, unsigned short cornerVtx1, unsigned short loopGapAngle0, unsigned short loopGapAngle1,
  382. unsigned short edgeBias0, unsigned short edgeBias1, unsigned short val0, unsigned short val1,
  383. unsigned short minOneRingOffset, unsigned short vtx1RingSize,
  384. Vector4D &edgePos0, Vector4D &edgePos1)
  385. {
  386. VPROF_BUDGET( "ComputeACCEdgePositions", _T("SubD Rendering") );
  387. if ( bndVtx0 )
  388. {
  389. val0 = 2*(val0 - 1);
  390. }
  391. if ( bndVtx1 )
  392. {
  393. val1 = 2*(val1 - 1);
  394. }
  395. Assert( val0 <= MAX_VALENCE );
  396. Assert( val1 <= MAX_VALENCE );
  397. float* pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ];
  398. float* pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ];
  399. int kEnd = (bndEdge) ? 4 : 6;
  400. if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) )
  401. {
  402. int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 };
  403. for ( int i = 1; i < kEnd; i++ )
  404. {
  405. oneRingIndex[i] = centerOffset + i - 1;
  406. }
  407. edgePos0 = edgePos1 = Vector4D(0,0,0,0);
  408. for ( int k = 0; k < kEnd; k++ )
  409. {
  410. int idx = ( k + minOneRingOffset ) % kEnd; // Offset to min index to enforce evaluation order between neighboring patches
  411. edgePos0 += pStencil0[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ];
  412. edgePos1 += pStencil1[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ];
  413. }
  414. }
  415. else
  416. {
  417. float b0, b1;
  418. b1 = edgeBias0 / 32768.0, b0 = 1.0f-b1;
  419. edgePos0 = (val0 * pPos[ oneRing[0] ] + 2*b0*pPos[ oneRing[centerOffset + 0] ] + 1*b0*pPos[ oneRing[centerOffset + 1] ] + 2*pPos[ oneRing[centerOffset + 2] ] + 1*b1*pPos[ oneRing[centerOffset + 3] ] + 2*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val0 + 5.0f);
  420. b1 = edgeBias1 / 32768.0, b0 = 1.0f-b1;
  421. edgePos1 = ( 2 * pPos[ oneRing[0] ] + 1*b0*pPos[ oneRing[centerOffset + 0] ] + 2*b0*pPos[ oneRing[centerOffset + 1] ] + val1*pPos[ oneRing[centerOffset + 2] ] + 2*b1*pPos[ oneRing[centerOffset + 3] ] + 1*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val1 + 5.0f);
  422. }
  423. #ifdef _DEBUG
  424. g_DebugEdgePositions.AddToTail( edgePos0 );
  425. g_DebugEdgePositions.AddToTail( edgePos1 );
  426. #endif
  427. }
  428. inline void ComputeACCInteriorPosition( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, Vector4D &interiorPos )
  429. {
  430. VPROF_BUDGET( "ComputeACCInteriorPosition", _T("SubD Rendering") );
  431. if ( bndVtx )
  432. {
  433. valence = valence>2 ? 2*(valence - 1) : 4*(valence - 1);
  434. }
  435. Assert( valence<=MAX_VALENCE );
  436. float *stencil = sPosInteriorStencil[ valence ];
  437. interiorPos = stencil[0] * pPos[ oneRing[0] ];
  438. for ( int k = 1; k < 4; ++k )
  439. {
  440. interiorPos += stencil[k] * pPos[ oneRing[ centerOffset + k - 1 ] ];
  441. }
  442. #ifdef _DEBUG
  443. g_DebugInteriorPositions.AddToTail( interiorPos );
  444. #endif
  445. }
  446. inline void ComputeACCGeometryPatchTangents( Vector4D *Pos, Vector4D *TanU, Vector4D *TanV )
  447. {
  448. VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );
  449. for ( int j=0; j<3; j++ )
  450. {
  451. for ( int i=0; i<4; i++ )
  452. {
  453. TanU[i*3+j] = 3*( Pos[i*4+j+1] - Pos[i*4+j] );
  454. TanV[j*4+i] = 3*( Pos[(j+1)*4+i] - Pos[j*4+i] );
  455. }
  456. }
  457. }
  458. void ComputeACCGeometryPatch( Vector4D* pPos, TopologyIndexStruct *quad, Vector4D* Pos)
  459. {
  460. VPROF_BUDGET( "ComputeACCGeometryPatch", _T("SubD Rendering") );
  461. int MOD4[8] = {0,1,2,3,0,1,2,3};
  462. int accCorner[] = {0,3,15,12};
  463. int accEdge1[] = {4,2,11,13};
  464. int accEdge2[] = {8,1,7,14};
  465. int accInterior[] = {5,6,10,9};
  466. int vtx1RingStart = 0;
  467. unsigned short *oneRing = quad->oneRing;
  468. for ( int i=0; i<4; i++ ) // 4 corner vertices
  469. {
  470. ComputeCatmullClarkLimitPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingSize[i], quad->minOneRingOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->nbCornerVtx[i], Pos[ accCorner[i] ] );
  471. ComputeACCEdgePositions( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i],
  472. quad->bndEdge[ MOD4[i+3] ],
  473. quad->bndVtx[i], quad->bndVtx[MOD4[i+3]],
  474. quad->cornerVtx[i], quad->cornerVtx[MOD4[i+3]],
  475. quad->loopGapAngle[i], quad->loopGapAngle[MOD4[i+3]],
  476. quad->edgeBias[ 2*MOD4[i+3] ], quad->edgeBias[ 2*MOD4[i+3] + 1 ],
  477. quad->valences[i], quad->valences[MOD4[i+3]],
  478. quad->minOneRingOffset[i], quad->vtx1RingSize[i],
  479. Pos[accEdge1[i]], Pos[accEdge2[i]] );
  480. ComputeACCInteriorPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->loopGapAngle[i], quad->valences[i], Pos[ accInterior[i] ] );
  481. vtx1RingStart += quad->vtx1RingSize[i];
  482. }
  483. }
  484. void ComputeACCTangentPatches( Vector4D* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV )
  485. {
  486. VPROF_BUDGET( "ComputeACCTangentPatches", _T("SubD Rendering") );
  487. int MOD4[8] = {0,1,2,3,0,1,2,3};
  488. int accTanCornerU[] = {0,2,11,9}; // counterclockwise orders!
  489. int accTanCornerV[] = {0,3,11,8};
  490. unsigned short *oneRing = quad->oneRing;
  491. ComputeACCGeometryPatchTangents(Pos, TanU, TanV);
  492. #if !defined( NO_TANGENTS )
  493. if ( !sShowACCGeometryTangents )
  494. {
  495. // compute corner tangents ( = subdivision surface limit tangents)
  496. int vtx1RingStart = 0;
  497. for ( int i=0; i<4; i++ )
  498. {
  499. int vtx1RingSize = quad->vtx1RingSize[i];
  500. Vector4D &accTanU = TanU[ accTanCornerU[i] ];
  501. Vector4D &accTanV = TanV[ accTanCornerV[i] ];
  502. ComputeCatmullClarkLimitTangents(i, pPos, &oneRing[vtx1RingStart], vtx1RingSize, quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->loopGapAngle[i], accTanU, accTanV );
  503. vtx1RingStart += vtx1RingSize;
  504. }
  505. // compute correction component to boundary tangents for tangent plane continuity
  506. // /TanV/ /TanU/ / TanV / /TanU/
  507. static int CB_CornerIdx[] = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
  508. static int CB_InteriorIdx[] = {1,2, 5,8, 10,9, 6,3 };
  509. static float CB_sign[] = {1,-1,1,-1};
  510. for ( int i=0; i<4; i++ ) // for all quad edges
  511. {
  512. if ( !quad->bndEdge[i] )
  513. {
  514. Vector4D *CBTanV = (i&1) ? TanU : TanV;
  515. Vector4D *CBTanU = (i&1) ? TanV : TanU;
  516. Vector4D u00 = CBTanU[CB_CornerIdx[3*i + 0]];
  517. Vector4D u10 = CBTanU[CB_CornerIdx[3*i + 1]];
  518. Vector4D u20 = CBTanU[CB_CornerIdx[3*i + 2]];
  519. int val0 = quad->valences[i];
  520. int val1 = quad->valences[MOD4[i+1]];
  521. if ( quad->bndVtx[i] )
  522. val0--;
  523. if ( quad->bndVtx[MOD4[i+1]] )
  524. val1--;
  525. float c0 = cos( (2*PI * quad->loopGapAngle[ i ] / 65535.0f) / val0 );
  526. float c1 = cos( (2*PI * quad->loopGapAngle[MOD4[i+1]] / 65535.0f) / val1 );
  527. CBTanV[ CB_InteriorIdx[2*i + 0] ] += CB_sign[i]*( 2*c0*u10 - c1*u00 )/3.0f;
  528. CBTanV[ CB_InteriorIdx[2*i + 1] ] += CB_sign[i]*( c0*u20 - 2*c1*u10 )/3.0f;
  529. }
  530. }
  531. }
  532. #endif
  533. }
  534. #endif // !defined( USE_OPT )
  535. #if defined( USE_OPT )
  536. #define M_PI2 6.28318530717958647692f
  537. static fltx4 Four_NegativeThirds;
  538. static fltx4 Four_Fives;
  539. static fltx4 Four_Tens;
  540. static fltx4 Four_N[32];
  541. static fltx4 Four_TwoPI;
  542. static fltx4 Four_Valence[MAX_VALENCE];
  543. static fltx4 Four_ValencePlus5[MAX_VALENCE];
  544. static fltx4 sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
  545. static fltx4 sPosEdge1Stencil[MAX_VALENCE+1][6];
  546. static fltx4 sPosEdge2Stencil[MAX_VALENCE+1][6];
  547. static fltx4 sPosInteriorStencil[MAX_VALENCE+1][4];
  548. static fltx4 sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  549. static fltx4 sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  550. static fltx4 sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  551. static fltx4 sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  552. static fltx4 sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  553. static fltx4 sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  554. static fltx4 sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
  555. static fltx4 sPosEdge1BndStencil[MAX_VALENCE+1][6];
  556. static fltx4 sPosEdge2BndStencil[MAX_VALENCE+1][6];
  557. static fltx4 sPosInteriorBndStencil[MAX_VALENCE+1][4];
  558. static fltx4 sPosEdge1CornerStencil[MAX_VALENCE+1][6];
  559. static fltx4 sPosEdge2CornerStencil[MAX_VALENCE+1][6];
  560. static fltx4 sCCSinPI[MAX_VALENCE*2][MAX_VALENCE];
  561. static fltx4 sCCCosPI[MAX_VALENCE*2][MAX_VALENCE];
  562. static float Valence_MinusOne[MAX_VALENCE];
  563. static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
  564. {
  565. VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );
  566. for ( int i=0; i<2*n; ++i )
  567. {
  568. stencilBuffer[i] = Four_Zeros;
  569. }
  570. if ( !boundary )
  571. {
  572. float scale = 1.0f / (n*n + 5.0f*n);
  573. stencilBuffer[0] = ReplicateX4( n*n * scale );
  574. for ( int i=0; i<n; i++ )
  575. {
  576. stencilBuffer[2*i+1] = ReplicateX4( 4.0f * scale );
  577. stencilBuffer[2*i+2] = ReplicateX4( 1.0f * scale );
  578. }
  579. }
  580. else
  581. {
  582. int k = n-1;
  583. float s = 1.0f / 6.0f;
  584. stencilBuffer[0] = ReplicateX4( s * 4.0f );
  585. stencilBuffer[1] = ReplicateX4( s * 1.0f );
  586. stencilBuffer[2*k+1] = ReplicateX4( s * 1.0f );
  587. }
  588. }
  589. static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
  590. {
  591. VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );
  592. for ( int i=0; i<2*n; ++i )
  593. {
  594. stencilBuffer1[i] = Four_Zeros;
  595. stencilBuffer2[i] = Four_Zeros;
  596. }
  597. if ( !bndVtx )
  598. {
  599. float scale_beta = 1.0f / (n * sqrtf(4.0f + cos(PI/n)*cos(PI/n)));
  600. float scale_alpha = 1.0f/n + cos(PI/n) * scale_beta;
  601. for ( int i=0; i<n; i++ )
  602. {
  603. stencilBuffer1[2*i+1] = ReplicateX4( cos( 2*PI*i/n ) * scale_alpha );
  604. stencilBuffer1[2*i+2] = ReplicateX4( cos((2*PI*i+PI)/n ) * scale_beta );
  605. int j = (i - 1)%n;
  606. stencilBuffer2[2*i+1] = ReplicateX4( cos( 2*PI*j/n ) * scale_alpha );
  607. stencilBuffer2[2*i+2] = ReplicateX4( cos((2*PI*j+PI)/n ) * scale_beta );
  608. }
  609. }
  610. else
  611. {
  612. // boundary vertex cases
  613. if ( cornerVtx )
  614. {
  615. if ( n<=2 )
  616. return;
  617. float sectorScale = 0, w;
  618. // treat first and last tangent (crease edges) separately
  619. w = tangentAveraging( n-1, 0 ); sectorScale += w;
  620. stencilBuffer1[ 1] = stencilBuffer1[ 1] + ReplicateX4( 0.5 * w );
  621. stencilBuffer1[ 0] = stencilBuffer1[ 0] + ReplicateX4( -0.5 * w );
  622. w = tangentAveraging( n-1, n-1 ); sectorScale += w;
  623. stencilBuffer1[ 2*(n-1)+1] = stencilBuffer1[ 2*(n-1)+1] + ReplicateX4( 0.5 * w );
  624. stencilBuffer1[ 0 ] = stencilBuffer1[ 0 ] + ReplicateX4( -0.5 * w );
  625. // inner tangents are computed using the 6 weights from the geometery edge construction.
  626. for (int k=1; k<(n-1); k++)
  627. {
  628. w = tangentAveraging( n-1, k ); sectorScale += w;
  629. float scale = 1.0f / (2.0f*n + 10.0f);
  630. stencilBuffer1[ 0] = stencilBuffer1[ 0] + ReplicateX4( w * (2.0f*n * scale - 1.0f) );
  631. stencilBuffer1[2*(k-1)+1] = stencilBuffer1[2*(k-1)+1] + ReplicateX4( w * 2.0f * scale );
  632. stencilBuffer1[2*(k-1)+2] = stencilBuffer1[2*(k-1)+2] + ReplicateX4( w * 1.0f * scale );
  633. stencilBuffer1[2*(k-1)+3] = stencilBuffer1[2*(k-1)+3] + ReplicateX4( w * 4.0f * scale );
  634. stencilBuffer1[2*(k-1)+4] = stencilBuffer1[2*(k-1)+4] + ReplicateX4( w * 1.0f * scale );
  635. stencilBuffer1[2*(k-1)+5] = stencilBuffer1[2*(k-1)+5] + ReplicateX4( w * 2.0f * scale );
  636. }
  637. // rescale weights
  638. fltx4 fltx4Scale = ReplicateX4( sectorScale );
  639. for ( int k = 0; k<2*n; ++k )
  640. {
  641. stencilBuffer1[k] = DivSIMD( stencilBuffer1[k], fltx4Scale );
  642. }
  643. }
  644. else
  645. {
  646. // special case to avoid colinear tangents
  647. if ( n==2 )
  648. {
  649. float s = 1.0f / 2.0f;
  650. stencilBuffer1[1] = ReplicateX4( 1.0 * s );
  651. stencilBuffer1[3] = ReplicateX4( -1.0 * s );
  652. stencilBuffer2[1] = ReplicateX4( -1.0 * s );
  653. stencilBuffer2[3] = ReplicateX4( 1.0 * s );
  654. // regularization term to avoid collinearity and preserve limit normal at the boundary
  655. float eps = 1e-4;
  656. stencilBuffer1[0] = AddSIMD( stencilBuffer1[0], ReplicateX4( eps * (-4.0/3.0) ) );
  657. stencilBuffer1[1] = AddSIMD( stencilBuffer1[1], ReplicateX4( eps * (1.0/2.0) ) );
  658. stencilBuffer1[2] = AddSIMD( stencilBuffer1[2], ReplicateX4( eps * (1.0/3.0) ) );
  659. stencilBuffer1[3] = AddSIMD( stencilBuffer1[3], ReplicateX4( eps * (1.0/2.0) ) );
  660. stencilBuffer2[0] = AddSIMD( stencilBuffer2[0], ReplicateX4( eps * (-4.0/3.0) ) );
  661. stencilBuffer2[1] = AddSIMD( stencilBuffer2[1], ReplicateX4( eps * (1.0/2.0) ) );
  662. stencilBuffer2[2] = AddSIMD( stencilBuffer2[2], ReplicateX4( eps * (1.0/3.0) ) );
  663. stencilBuffer2[3] = AddSIMD( stencilBuffer2[3], ReplicateX4( eps * (1.0/2.0) ) );
  664. }
  665. else
  666. {
  667. int k = n-1;
  668. float c = cos( PI / k ), s=sin( PI / k );
  669. stencilBuffer1[2*0+1] = ReplicateX4( 0.5f );
  670. stencilBuffer1[2*k+1] = ReplicateX4( -0.5f );
  671. stencilBuffer2[0] = ReplicateX4( -4.0f*s / (3.0f*k + c) ); // gamma
  672. for ( int i=0; i<k; ++i )
  673. {
  674. stencilBuffer2[2*i+1] = ReplicateX4( 4*sin(PI*i/k)/(3*k+c) ); // alpha_i
  675. stencilBuffer2[2*i+2] = ReplicateX4( (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c) ); // beta_i
  676. }
  677. stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = ReplicateX4( -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) ) ); // alpha_0, alpha_k
  678. }
  679. }
  680. }
  681. }
  682. static void ComputeACCEdgePosStencils(byte boundary, byte corner, int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
  683. {
  684. VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );
  685. for ( int i=0; i<6; ++i )
  686. {
  687. stencilBuffer1[i] = Four_Zeros;
  688. stencilBuffer2[i] = Four_Zeros;
  689. }
  690. if ( !boundary )
  691. {
  692. float scale = 1.0f / (2.0f*n + 10.0f);
  693. stencilBuffer1[0] = ReplicateX4( 2.0f*n * scale ); stencilBuffer2[0] = ReplicateX4( 4.0f * scale );
  694. stencilBuffer1[1] = ReplicateX4( 2.0f * scale ); stencilBuffer2[1] = ReplicateX4( 1.0f * scale );
  695. stencilBuffer1[2] = ReplicateX4( 1.0f * scale ); stencilBuffer2[2] = ReplicateX4( 2.0f * scale );
  696. stencilBuffer1[3] = ReplicateX4( 4.0f * scale ); stencilBuffer2[3] = ReplicateX4( 2.0f*n* scale );
  697. stencilBuffer1[4] = ReplicateX4( 1.0f * scale ); stencilBuffer2[4] = ReplicateX4( 2.0f * scale );
  698. stencilBuffer1[5] = ReplicateX4( 2.0f * scale ); stencilBuffer2[5] = ReplicateX4( 1.0f * scale );
  699. }
  700. else
  701. {
  702. // boundary stencil
  703. if ( corner )
  704. {
  705. float scale = 1.0f / (3.0f);
  706. stencilBuffer1[0] = ReplicateX4( 2.0f * scale ); stencilBuffer2[0] = ReplicateX4( 1.0f * scale );
  707. stencilBuffer1[3] = ReplicateX4( 1.0f * scale ); stencilBuffer2[3] = ReplicateX4( 2.0f * scale );
  708. }
  709. else
  710. {
  711. float scale = 1.0f / 3.0f;
  712. stencilBuffer1[0] = ReplicateX4( 2.0f * scale ); stencilBuffer2[0] = ReplicateX4( 1.0f * scale );
  713. stencilBuffer1[3] = ReplicateX4( 1.0f * scale ); stencilBuffer2[3] = ReplicateX4( 2.0f * scale );
  714. }
  715. }
  716. }
  717. static void ComputeACCInteriorPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
  718. {
  719. VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );
  720. float scale = 1.0f / (n + 5.0f);
  721. stencilBuffer[0] = ReplicateX4( n * scale );
  722. stencilBuffer[1] = ReplicateX4( 2.0f * scale );
  723. stencilBuffer[2] = ReplicateX4( 1.0f * scale );
  724. stencilBuffer[3] = ReplicateX4( 2.0f * scale );
  725. }
  726. static void ComputeACCSinCosPITables()
  727. {
  728. fltx4 PI4 = ReplicateX4( M_PI );
  729. for ( int j=0; j<MAX_VALENCE*2; ++j )
  730. {
  731. fltx4 j4 = ReplicateX4( (float)j );
  732. for ( int k=0; k<MAX_VALENCE; ++k )
  733. {
  734. fltx4 k4 = ReplicateX4( (float)k );
  735. fltx4 radians = DivSIMD( MulSIMD( PI4, j4 ), k4 );
  736. // not really simd
  737. SinCosSIMD( sCCSinPI[j][k], sCCCosPI[j][k], radians );
  738. }
  739. }
  740. }
  741. void FillTables()
  742. {
  743. if ( sTableInited )
  744. return;
  745. // Some simd stuff
  746. Four_TwoPI = ReplicateX4( 2*M_PI );
  747. Four_Tens = ReplicateX4( 10.0f );
  748. Four_Fives = ReplicateX4( 5 );
  749. Four_NegativeThirds = ReplicateX4( -0.333333333333333f );
  750. for ( int i=0; i<32; ++i )
  751. {
  752. Four_N[i] = ReplicateX4( (float)i );
  753. }
  754. for ( int i=0; i<MAX_VALENCE; ++i )
  755. {
  756. Four_Valence[i] = ReplicateX4( (float)i );
  757. Four_ValencePlus5[i] = ReplicateX4( (float)i + 5.0f );
  758. Valence_MinusOne[i] = (float)(i-1);
  759. }
  760. for ( int val=0; val<=MAX_VALENCE; val++ )
  761. {
  762. // interior stencils
  763. ComputeCatmullClarkLimitPosStencil( false, val, sPosCornerStencil[val] );
  764. ComputeACCEdgePosStencils( false, false, val, sPosEdge1Stencil[val], sPosEdge2Stencil[val] );
  765. ComputeACCInteriorPosStencil( false, val, sPosInteriorStencil[val] );
  766. // boundary stencils
  767. ComputeCatmullClarkLimitPosStencil( true, val, sPosCornerBndStencil[val] );
  768. ComputeACCEdgePosStencils( true, false, val, sPosEdge1BndStencil[val], sPosEdge2BndStencil[val] );
  769. ComputeACCEdgePosStencils( true, true, val, sPosEdge1CornerStencil[val], sPosEdge2CornerStencil[val] );
  770. ComputeACCInteriorPosStencil( true, val, sPosInteriorBndStencil[val] );
  771. ComputeCatmullClarkLimitTanStencil( false, false, val, sCCLimitTanStencil1[val], sCCLimitTanStencil2[val] );
  772. ComputeCatmullClarkLimitTanStencil( true, false, val, sCCLimitTanBndStencil1[val], sCCLimitTanBndStencil2[val] );
  773. ComputeCatmullClarkLimitTanStencil( true, true, val, sCCLimitTanCornerStencil1[val], sCCLimitTanCornerStencil2[val] );
  774. }
  775. // sincos tables
  776. ComputeACCSinCosPITables();
  777. sTableInited = true;
  778. }
  779. //--------------------------------------------------------------------------------------
  780. // Runtime
  781. //--------------------------------------------------------------------------------------
  782. FORCEINLINE void ComputeCatmullClarkLimitPosition( fltx4 *pPos, unsigned short *pOneRing,
  783. unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
  784. unsigned short cornerVtx, unsigned short valence, fltx4 &limitPos )
  785. {
  786. VPROF_BUDGET( "ComputeCatmullClarkLimitPosition (SIMD)", _T( "SubD Rendering" ) );
  787. assert( pPos );
  788. assert( pOneRing );
  789. if ( cornerVtx > 0 )
  790. {
  791. limitPos = pPos[ pOneRing[0] ];
  792. }
  793. else
  794. {
  795. assert( valence <= MAX_VALENCE );
  796. fltx4 *pStencil = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ];
  797. // pStencil[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer)
  798. limitPos = MulSIMD( pStencil[0], pPos[ pOneRing[0] ] );
  799. for ( int k = 0; k < vtx1RingSize; k++ )
  800. {
  801. int idx = ( k + minOneRingIndex ) % vtx1RingSize; // Shuffle to get the minimum index consistently first in order
  802. if ( idx != 0 ) // Don't do pStencil[0] again
  803. {
  804. limitPos = MaddSIMD( pStencil[idx], pPos[ pOneRing[idx] ], limitPos );
  805. }
  806. }
  807. }
  808. }
  809. FORCEINLINE fltx4 VectorNormalize( fltx4 &A )
  810. {
  811. fltx4 mag_sq = Dot3SIMD( A, A ); // length^2
  812. fltx4 invSqrt = ReciprocalSqrtEstSIMD(mag_sq);
  813. return MulSIMD( A, invSqrt );
  814. }
  815. FORCEINLINE fltx4 VectorLength( fltx4 &A )
  816. {
  817. fltx4 mag_sq = Dot3SIMD( A, A ); // length^2
  818. fltx4 invSqrt = ReciprocalSqrtEstSIMD(mag_sq);
  819. return invSqrt;
  820. }
  821. FORCEINLINE fltx4 CrossProduct( const fltx4 &A, const fltx4 &B )
  822. {
  823. #if defined( _X360 )
  824. return XMVector3Cross( A, B );
  825. #elif defined( _WIN32 )
  826. fltx4 A1 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
  827. fltx4 B1 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
  828. fltx4 Result1 = MulSIMD( A1, B1 );
  829. fltx4 A2 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
  830. fltx4 B2 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
  831. fltx4 Result2 = MulSIMD( A2, B2 );
  832. return SubSIMD( Result1, Result2 );
  833. #else
  834. fltx4 CrossVal;
  835. SubFloat( CrossVal, 0 ) = SubFloat( A, 1 )*SubFloat( B, 2 ) - SubFloat( A, 2 )*SubFloat( B, 1 );
  836. SubFloat( CrossVal, 1 ) = SubFloat( A, 2 )*SubFloat( B, 0 ) - SubFloat( A, 0 )*SubFloat( B, 2 );
  837. SubFloat( CrossVal, 2 ) = SubFloat( A, 0 )*SubFloat( B, 1 ) - SubFloat( A, 1 )*SubFloat( B, 0 );
  838. SubFloat( CrossVal, 3 ) = 0;
  839. return CrossVal;
  840. #endif
  841. }
  842. FORCEINLINE void ComputeCatmullClarkLimitTangents( int idx, fltx4 *pPos, unsigned short *pOneRing, unsigned short vtx1RingSize,
  843. unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
  844. unsigned short valence, float &loopGapAngle, fltx4 &limitTanU, fltx4 &limitTanV )
  845. {
  846. VPROF_BUDGET( "ComputeCatmullClarkLimitTangents (SIMD)", _T( "SubD Rendering" ) );
  847. // for valence=1, no need to have separate tangents
  848. static const fltx4 tanUSign[4] = { Four_Ones, Four_NegativeOnes, Four_NegativeOnes, Four_Ones };
  849. static const fltx4 tanVSign[4] = { Four_Ones, Four_Ones, Four_NegativeOnes, Four_NegativeOnes };
  850. if (!sUseCornerTangents) cornerVtx = 0;
  851. // interior vertices
  852. if ( !bndVtx )
  853. {
  854. fltx4 *pStencil0 = sCCLimitTanStencil1[ valence ];
  855. fltx4 *pStencil1 = sCCLimitTanStencil2[ valence ];
  856. limitTanU = limitTanV = Four_Zeros;
  857. for ( int k = 0; k < vtx1RingSize; k++ )
  858. {
  859. limitTanU = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], limitTanU );
  860. limitTanV = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], limitTanV );
  861. }
  862. }
  863. else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) )
  864. {
  865. // smooth boundary vertices
  866. fltx4 *pStencil0 = sCCLimitTanBndStencil1[ valence ];
  867. fltx4 *pStencil1 = sCCLimitTanBndStencil2[ valence ];
  868. fltx4 r0 = Four_Zeros;
  869. fltx4 r1 = Four_Zeros;
  870. for (int k = 0; k < vtx1RingSize; ++k)
  871. {
  872. r0 = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], r0 );
  873. r1 = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], r1 );
  874. }
  875. int j1 = ( centerOffset - 1 ) / 2;
  876. int j2 = j1 + 1;
  877. int k = valence - 1;
  878. if ( valence == 2 )
  879. {
  880. limitTanU = r0;
  881. limitTanV = r1;
  882. }
  883. else
  884. {
  885. limitTanU = AddSIMD( MulSIMD( sCCCosPI[j1][k], r0 ), MulSIMD( sCCSinPI[j1][k], r1 ) );
  886. limitTanV = AddSIMD( MulSIMD( sCCCosPI[j2][k], r0 ), MulSIMD( sCCSinPI[j2][k], r1 ) );
  887. }
  888. }
  889. else
  890. {
  891. // Corner vertices
  892. if ( valence == 2 )
  893. return;
  894. fltx4 *pEdgeStencil = sPosEdge1Stencil[ valence ];
  895. // Compute tangents
  896. fltx4 c0 = SubSIMD( pPos[ pOneRing[ 1 ] ], pPos[ pOneRing[ 0 ] ] );
  897. fltx4 c1 = SubSIMD( pPos[ pOneRing[ vtx1RingSize - 1 ] ], pPos[ pOneRing[ 0 ] ] );
  898. fltx4 e0 = MulSIMD( SubSIMD( pEdgeStencil[0], Four_Ones ), pPos[ pOneRing[ 0 ] ] );
  899. fltx4 e1 = e0;
  900. for ( int k = 1; k < 6; k++ )
  901. {
  902. e0 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ k ] ], e0 );
  903. e1 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ vtx1RingSize - 6 + k ] ], e1 );
  904. }
  905. // Compute average tangent plane normal
  906. fltx4 n0 = CrossProduct( c0, e0 );
  907. n0 = VectorNormalize( n0 );
  908. fltx4 n1 = CrossProduct( e1, c1 );
  909. n1 = VectorNormalize( n1 );
  910. fltx4 N = AddSIMD( n0, n1 );
  911. N = VectorNormalize( N );
  912. // Project into tangent plane
  913. fltx4 DotC0N = Dot3SIMD( c0, N );
  914. fltx4 DotC1N = Dot3SIMD( c1, N );
  915. c0 = SubSIMD( c0, MulSIMD( DotC0N, N ) );
  916. c1 = SubSIMD( c1, MulSIMD( DotC1N, N ) );
  917. fltx4 c0l = VectorLength( c0 );
  918. c0 = DivSIMD( c0, c0l );
  919. fltx4 c1l = VectorLength( c1 );
  920. c1 = DivSIMD( c1, c1l );
  921. fltx4 cAvg = MulSIMD( AddSIMD(c0l,c1l), Four_PointFives );
  922. // Compute angle
  923. fltx4 c0p = CrossProduct(N, c0);
  924. fltx4 dot1 = Dot3SIMD(c0p, c1);
  925. fltx4 dot2 = Dot3SIMD(c0, c1);
  926. float angle = PI - atan2( SubFloat( dot1, 0 ), -SubFloat( dot2, 0 ) );
  927. loopGapAngle = angle;
  928. // Compute final tangent vector
  929. int j1 = ( centerOffset - 1 ) / 2;
  930. int j2 = j1 + 1;
  931. int K = (valence - 1);
  932. static float fK[MAX_VALENCE] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
  933. 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
  934. 17.0f, 18.0f };
  935. // Compute final tangent vector
  936. float flK = fK[K];
  937. fltx4 Cos0 = ReplicateX4( cos( angle*j1 / flK ) );
  938. fltx4 Sin0 = ReplicateX4( sin( angle*j1 / flK ) );
  939. fltx4 Cos1 = ReplicateX4( cos( angle*j2 / flK ) );
  940. fltx4 Sin1 = ReplicateX4( sin( angle*j2 / flK ) );
  941. limitTanU = cAvg * ( Cos0 * c0 + Sin0 * c0p );
  942. limitTanV = cAvg * ( Cos1 * c0 + Sin1 * c0p );
  943. }
  944. // Flip tangents so they point in u/v direction
  945. if ( idx & 1 )
  946. {
  947. V_swap( limitTanU, limitTanV );
  948. }
  949. limitTanU = MulSIMD( limitTanU, tanUSign[idx] );
  950. limitTanV = MulSIMD( limitTanV, tanVSign[idx] );
  951. }
  952. FORCEINLINE void ComputeACCEdgePositions( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset,
  953. unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
  954. unsigned short cornerVtx0, unsigned short cornerVtx1,
  955. unsigned short edgeBias0, unsigned short edgeBias1,
  956. unsigned short val0, unsigned short val1,
  957. unsigned short minOneRingOffset, unsigned short vtx1RingSize,
  958. fltx4 &edgePos0, fltx4 &edgePos1)
  959. {
  960. VPROF_BUDGET( "ComputeACCEdgePositions (SIMD)", _T("SubD Rendering") );
  961. if ( bndVtx0 )
  962. {
  963. val0 = 2*(val0 - 1);
  964. }
  965. if ( bndVtx1 )
  966. {
  967. val1 = 2*(val1 - 1);
  968. }
  969. Assert( val0 <= MAX_VALENCE );
  970. Assert( val1 <= MAX_VALENCE );
  971. fltx4 *pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ];
  972. fltx4 *pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ];
  973. int kEnd = (bndEdge) ? 4 : 6;
  974. if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) )
  975. {
  976. int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 };
  977. for ( int i = 1; i < kEnd; i++ )
  978. {
  979. oneRingIndex[i] = centerOffset + i - 1;
  980. }
  981. edgePos0 = edgePos1 = Four_Zeros;
  982. for ( int k = 0; k < kEnd; k++ )
  983. {
  984. int idx = ( k + minOneRingOffset ) % kEnd; // Offset to min index to enforce evaluation order between neighboring patches
  985. edgePos0 = MaddSIMD( pStencil0[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos0 );
  986. edgePos1 = MaddSIMD( pStencil1[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos1 );
  987. }
  988. }
  989. else
  990. {
  991. fltx4 b0, b1;
  992. b1 = ReplicateX4( edgeBias0 / 32768.0f );
  993. b0 = SubSIMD( Four_Ones, b1 );
  994. edgePos0 = DivSIMD( ( Four_Valence[val0]*pPos[ oneRing[0] ] +
  995. Four_Twos*b0*pPos[ oneRing[ centerOffset] ] +
  996. b0*pPos[ oneRing[centerOffset + 1] ] +
  997. Four_Twos*pPos[ oneRing[centerOffset + 2] ] +
  998. b1*pPos[ oneRing[centerOffset + 3] ] +
  999. Four_Twos*b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] );
  1000. b1 = ReplicateX4( edgeBias1 / 32768.0f );
  1001. b0 = SubSIMD( Four_Ones, b1 );
  1002. edgePos1 = DivSIMD( ( Four_Twos*pPos[ oneRing[0] ] +
  1003. b0*pPos[ oneRing[centerOffset + 0] ] +
  1004. Four_Twos*b0*pPos[ oneRing[centerOffset + 1] ] +
  1005. Four_Valence[val1]*pPos[ oneRing[centerOffset + 2] ] +
  1006. Four_Twos*b1*pPos[ oneRing[centerOffset + 3] ] +
  1007. b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] );
  1008. }
  1009. }
  1010. FORCEINLINE void ComputeACCInteriorPosition( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, fltx4 &interiorPos )
  1011. {
  1012. VPROF_BUDGET( "ComputeACCInteriorPosition (SIMD)", _T( "SubD Rendering" ) );
  1013. if ( bndVtx )
  1014. {
  1015. valence = valence > 2 ? 2 * (valence - 1) : 4 * (valence - 1);
  1016. }
  1017. Assert( valence <= MAX_VALENCE );
  1018. fltx4 *pStencil = sPosInteriorStencil[ valence ];
  1019. interiorPos = MulSIMD( pStencil[0], pPos[ oneRing[0] ] );
  1020. for ( int k = 1; k < 4; k++ )
  1021. {
  1022. interiorPos = MaddSIMD( pStencil[k], pPos[ oneRing[ centerOffset + k - 1 ] ], interiorPos );
  1023. }
  1024. }
  1025. FORCEINLINE void ComputeACCGeometryPatchTangents( fltx4 *Pos, fltx4 *TanU, fltx4 *TanV )
  1026. {
  1027. //VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );
  1028. TanU[0] = MulSIMD( Four_Threes, SubSIMD( Pos[1], Pos[0] ) );
  1029. TanV[0] = MulSIMD( Four_Threes, SubSIMD( Pos[4], Pos[0] ) );
  1030. TanU[3] = MulSIMD( Four_Threes, SubSIMD( Pos[5], Pos[4] ) );
  1031. TanV[1] = MulSIMD( Four_Threes, SubSIMD( Pos[5], Pos[1] ) );
  1032. TanU[6] = MulSIMD( Four_Threes, SubSIMD( Pos[9], Pos[8] ) );
  1033. TanV[2] = MulSIMD( Four_Threes, SubSIMD( Pos[6], Pos[2] ) );
  1034. TanU[9] = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[12] ) );
  1035. TanV[3] = MulSIMD( Four_Threes, SubSIMD( Pos[7], Pos[3] ) );
  1036. TanU[1] = MulSIMD( Four_Threes, SubSIMD( Pos[2], Pos[1] ) );
  1037. TanV[4] = MulSIMD( Four_Threes, SubSIMD( Pos[8], Pos[4] ) );
  1038. TanU[4] = MulSIMD( Four_Threes, SubSIMD( Pos[6], Pos[5] ) );
  1039. TanV[5] = MulSIMD( Four_Threes, SubSIMD( Pos[9], Pos[5] ) );
  1040. TanU[7] = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[9] ) );
  1041. TanV[6] = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[6] ) );
  1042. TanU[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[13] ) );
  1043. TanV[7] = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[7] ) );
  1044. TanU[2] = MulSIMD( Four_Threes, SubSIMD( Pos[3], Pos[2] ) );
  1045. TanV[8] = MulSIMD( Four_Threes, SubSIMD( Pos[12], Pos[8] ) );
  1046. TanU[5] = MulSIMD( Four_Threes, SubSIMD( Pos[7], Pos[6] ) );
  1047. TanV[9] = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[9] ) );
  1048. TanU[8] = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[10] ) );
  1049. TanV[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[10] ) );
  1050. TanU[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[14] ) );
  1051. TanV[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[11] ) );
  1052. }
  1053. void ComputeACCAllPatches( fltx4* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV, bool bRegularPatch )
  1054. {
  1055. VPROF_BUDGET( "ComputeACCAllPatches (SIMD)", _T( "SubD Rendering" ) );
  1056. int accCorner[] = { 0, 3, 15, 12 };
  1057. int accEdge1[] = { 4, 2, 11, 13 };
  1058. int accEdge2[] = { 8, 1, 7, 14 };
  1059. int accInterior[] = { 5, 6, 10, 9 };
  1060. int accTanCornerU[] = { 0, 2, 11, 9 }; // counterclockwise orders!
  1061. int accTanCornerV[] = { 0, 3, 11, 8 };
  1062. fltx4 OutPos[16], OutTanU[16], OutTanV[16];
  1063. // Point to four one-rings
  1064. int vtx1RingStart = 0;
  1065. unsigned short* pOneRing[4];
  1066. for ( int i = 0; i < 4; i++ )
  1067. {
  1068. unsigned short vtx1RingSize = quad->vtx1RingSize[i];
  1069. pOneRing[i] = &(quad->oneRing[vtx1RingStart]);
  1070. vtx1RingStart += vtx1RingSize;
  1071. }
  1072. {
  1073. VPROF_BUDGET( "ComputeACCAllPatches - Geometry Control Points (SIMD)", _T( "SubD Rendering" ) );
  1074. ComputeCatmullClarkLimitPosition( pPos, pOneRing[0], quad->vtx1RingSize[0], quad->minOneRingOffset[0], quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], OutPos[ accCorner[0] ] );
  1075. ComputeCatmullClarkLimitPosition( pPos, pOneRing[1], quad->vtx1RingSize[1], quad->minOneRingOffset[1], quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], OutPos[ accCorner[1] ] );
  1076. ComputeCatmullClarkLimitPosition( pPos, pOneRing[2], quad->vtx1RingSize[2], quad->minOneRingOffset[2], quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], OutPos[ accCorner[2] ] );
  1077. ComputeCatmullClarkLimitPosition( pPos, pOneRing[3], quad->vtx1RingSize[3], quad->minOneRingOffset[3], quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], OutPos[ accCorner[3] ] );
  1078. ComputeACCEdgePositions( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0],
  1079. quad->bndEdge[3], quad->bndVtx[0], quad->bndVtx[3],
  1080. quad->cornerVtx[0], quad->cornerVtx[3],
  1081. quad->edgeBias[6], quad->edgeBias[7],
  1082. quad->valences[0], quad->valences[3],
  1083. quad->minOneRingOffset[0], quad->vtx1RingSize[0],
  1084. OutPos[accEdge1[0]], OutPos[accEdge2[0]] );
  1085. ComputeACCEdgePositions( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1],
  1086. quad->bndEdge[0], quad->bndVtx[1], quad->bndVtx[0],
  1087. quad->cornerVtx[1], quad->cornerVtx[0],
  1088. quad->edgeBias[0], quad->edgeBias[1],
  1089. quad->valences[1], quad->valences[0],
  1090. quad->minOneRingOffset[1], quad->vtx1RingSize[1],
  1091. OutPos[accEdge1[1]], OutPos[accEdge2[1]] );
  1092. ComputeACCEdgePositions( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2],
  1093. quad->bndEdge[1], quad->bndVtx[2], quad->bndVtx[1],
  1094. quad->cornerVtx[2], quad->cornerVtx[1],
  1095. quad->edgeBias[2], quad->edgeBias[3],
  1096. quad->valences[2], quad->valences[1],
  1097. quad->minOneRingOffset[2], quad->vtx1RingSize[2],
  1098. OutPos[accEdge1[2]], OutPos[accEdge2[2]] );
  1099. ComputeACCEdgePositions( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3],
  1100. quad->bndEdge[2], quad->bndVtx[3], quad->bndVtx[2],
  1101. quad->cornerVtx[3], quad->cornerVtx[2],
  1102. quad->edgeBias[4], quad->edgeBias[5],
  1103. quad->valences[3], quad->valences[2],
  1104. quad->minOneRingOffset[3], quad->vtx1RingSize[3],
  1105. OutPos[accEdge1[3]], OutPos[accEdge2[3]] );
  1106. ComputeACCInteriorPosition( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0], quad->bndVtx[0], quad->valences[0], OutPos[ accInterior[0] ] );
  1107. ComputeACCInteriorPosition( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1], quad->bndVtx[1], quad->valences[1], OutPos[ accInterior[1] ] );
  1108. ComputeACCInteriorPosition( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2], quad->bndVtx[2], quad->valences[2], OutPos[ accInterior[2] ] );
  1109. ComputeACCInteriorPosition( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3], quad->bndVtx[3], quad->valences[3], OutPos[ accInterior[3] ] );
  1110. }
  1111. #if !defined( NO_TANGENTS )
  1112. // Don't compute tangents for regular patches
  1113. #if defined( SEPARATE_REGULAR_AND_EXTRA )
  1114. if ( !bRegularPatch )
  1115. #endif
  1116. {
  1117. VPROF_BUDGET( "ComputeACCAllPatches - Tangents (SIMD)", _T( "SubD Rendering" ) );
  1118. ComputeACCGeometryPatchTangents( OutPos, OutTanU, OutTanV );
  1119. float flLoopGap[4];
  1120. flLoopGap[0] = ( M_PI2 * quad->loopGapAngle[0] ) / 65535.0f;
  1121. flLoopGap[1] = ( M_PI2 * quad->loopGapAngle[1] ) / 65535.0f;
  1122. flLoopGap[2] = ( M_PI2 * quad->loopGapAngle[2] ) / 65535.0f;
  1123. flLoopGap[3] = ( M_PI2 * quad->loopGapAngle[3] ) / 65535.0f;
  1124. if ( !sShowACCGeometryTangents )
  1125. {
  1126. {
  1127. ComputeCatmullClarkLimitTangents( 0, pPos, pOneRing[0], quad->vtx1RingSize[0], quad->vtx1RingCenterQuadOffset[0],
  1128. quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], flLoopGap[0], OutTanU[ accTanCornerU[0] ], OutTanV[ accTanCornerV[0] ] );
  1129. ComputeCatmullClarkLimitTangents( 1, pPos, pOneRing[1], quad->vtx1RingSize[1], quad->vtx1RingCenterQuadOffset[1],
  1130. quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], flLoopGap[1], OutTanU[ accTanCornerU[1] ], OutTanV[ accTanCornerV[1] ] );
  1131. ComputeCatmullClarkLimitTangents( 2, pPos, pOneRing[2], quad->vtx1RingSize[2], quad->vtx1RingCenterQuadOffset[2],
  1132. quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], flLoopGap[2], OutTanU[ accTanCornerU[2] ], OutTanV[ accTanCornerV[2] ] );
  1133. ComputeCatmullClarkLimitTangents( 3, pPos, pOneRing[3], quad->vtx1RingSize[3], quad->vtx1RingCenterQuadOffset[3],
  1134. quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], flLoopGap[3], OutTanU[ accTanCornerU[3] ], OutTanV[ accTanCornerV[3] ] );
  1135. }
  1136. // compute correction component to boundary tangents for tangent plane continuity
  1137. // /TanV/ /TanU/ / TanV / /TanU/
  1138. static int CB_CornerIdx[] = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
  1139. static int CB_InteriorIdx[] = {1,2, 5,8, 10,9, 6,3 };
  1140. static fltx4 CB_sign[4] = {Four_Ones,Four_NegativeOnes,Four_Ones,Four_NegativeOnes};
  1141. {
  1142. // Unroll, since the compiler wants to keep it rolled, and we get better perf unrolled
  1143. {
  1144. fltx4 u00 = OutTanU[CB_CornerIdx[0]];
  1145. fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[1]], Four_Twos );
  1146. fltx4 u20 = OutTanU[CB_CornerIdx[2]];
  1147. int val0 = quad->valences[0]; int val1 = quad->valences[1];
  1148. if ( quad->bndVtx[0] ) val0--;
  1149. if ( quad->bndVtx[1] ) val1--;
  1150. fltx4 c0 = ReplicateX4( cosf( (flLoopGap[0]) / val0 ) );
  1151. fltx4 c1 = ReplicateX4( cosf( (flLoopGap[1]) / val1 ) );
  1152. fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
  1153. fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
  1154. OutTanV[CB_InteriorIdx[0] ] = AddSIMD( OutTanV[CB_InteriorIdx[0] ], E );
  1155. OutTanV[CB_InteriorIdx[1] ] = AddSIMD( OutTanV[CB_InteriorIdx[1] ], F );
  1156. }
  1157. {
  1158. fltx4 u00 = OutTanV[CB_CornerIdx[3]];
  1159. fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[4]], Four_Twos );
  1160. fltx4 u20 = OutTanV[CB_CornerIdx[5]];
  1161. int val0 = quad->valences[1]; int val1 = quad->valences[2];
  1162. if ( quad->bndVtx[1] ) val0--;
  1163. if ( quad->bndVtx[2] ) val1--;
  1164. fltx4 c0 = ReplicateX4( cosf( (flLoopGap[1]) / val0 ) );
  1165. fltx4 c1 = ReplicateX4( cosf( (flLoopGap[2]) / val1 ) );
  1166. fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
  1167. fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
  1168. OutTanU[CB_InteriorIdx[2] ] = SubSIMD( OutTanU[CB_InteriorIdx[2] ], E );
  1169. OutTanU[CB_InteriorIdx[3] ] = SubSIMD( OutTanU[CB_InteriorIdx[3] ], F );
  1170. }
  1171. {
  1172. fltx4 u00 = OutTanU[CB_CornerIdx[6]];
  1173. fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[7]], Four_Twos );
  1174. fltx4 u20 = OutTanU[CB_CornerIdx[8]];
  1175. int val0 = quad->valences[2]; int val1 = quad->valences[3];
  1176. if ( quad->bndVtx[2] ) val0--;
  1177. if ( quad->bndVtx[3] ) val1--;
  1178. fltx4 c0 = ReplicateX4( cosf( (flLoopGap[2]) / val0 ) );
  1179. fltx4 c1 = ReplicateX4( cosf( (flLoopGap[3]) / val1 ) );
  1180. fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
  1181. fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
  1182. OutTanV[CB_InteriorIdx[4] ] = AddSIMD( OutTanV[CB_InteriorIdx[4] ], E );
  1183. OutTanV[CB_InteriorIdx[5] ] = AddSIMD( OutTanV[CB_InteriorIdx[5] ], F );
  1184. }
  1185. {
  1186. fltx4 u00 = OutTanV[CB_CornerIdx[9]];
  1187. fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[10]], Four_Twos );
  1188. fltx4 u20 = OutTanV[CB_CornerIdx[11]];
  1189. int val0 = quad->valences[3]; int val1 = quad->valences[0];
  1190. if ( quad->bndVtx[3] ) val0--;
  1191. if ( quad->bndVtx[0] ) val1--;
  1192. fltx4 c0 = ReplicateX4( cosf( (flLoopGap[3]) / val0 ) );
  1193. fltx4 c1 = ReplicateX4( cosf( (flLoopGap[0]) / val1 ) );
  1194. fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
  1195. fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
  1196. OutTanU[CB_InteriorIdx[6] ] = SubSIMD( OutTanU[CB_InteriorIdx[6] ], E );
  1197. OutTanU[CB_InteriorIdx[7] ] = SubSIMD( OutTanU[CB_InteriorIdx[7] ], F );
  1198. }
  1199. }
  1200. }
  1201. StoreAlignedSIMD( (float*)&TanU[0], OutTanU[0] );
  1202. StoreAlignedSIMD( (float*)&TanU[1], OutTanU[1] );
  1203. StoreAlignedSIMD( (float*)&TanU[2], OutTanU[2] );
  1204. StoreAlignedSIMD( (float*)&TanU[3], OutTanU[3] );
  1205. StoreAlignedSIMD( (float*)&TanU[4], OutTanU[4] );
  1206. StoreAlignedSIMD( (float*)&TanU[5], OutTanU[5] );
  1207. StoreAlignedSIMD( (float*)&TanU[6], OutTanU[6] );
  1208. StoreAlignedSIMD( (float*)&TanU[7], OutTanU[7] );
  1209. StoreAlignedSIMD( (float*)&TanU[8], OutTanU[8] );
  1210. StoreAlignedSIMD( (float*)&TanU[9], OutTanU[9] );
  1211. StoreAlignedSIMD( (float*)&TanU[10], OutTanU[10] );
  1212. StoreAlignedSIMD( (float*)&TanU[11], OutTanU[11] );
  1213. StoreAlignedSIMD( (float*)&TanV[0], OutTanV[0] );
  1214. StoreAlignedSIMD( (float*)&TanV[1], OutTanV[1] );
  1215. StoreAlignedSIMD( (float*)&TanV[2], OutTanV[2] );
  1216. StoreAlignedSIMD( (float*)&TanV[3], OutTanV[3] );
  1217. StoreAlignedSIMD( (float*)&TanV[4], OutTanV[4] );
  1218. StoreAlignedSIMD( (float*)&TanV[5], OutTanV[5] );
  1219. StoreAlignedSIMD( (float*)&TanV[6], OutTanV[6] );
  1220. StoreAlignedSIMD( (float*)&TanV[7], OutTanV[7] );
  1221. StoreAlignedSIMD( (float*)&TanV[8], OutTanV[8] );
  1222. StoreAlignedSIMD( (float*)&TanV[9], OutTanV[9] );
  1223. StoreAlignedSIMD( (float*)&TanV[10], OutTanV[10] );
  1224. StoreAlignedSIMD( (float*)&TanV[11], OutTanV[11] );
  1225. }
  1226. #endif
  1227. StoreAlignedSIMD( (float*)&Pos[0], OutPos[0] );
  1228. StoreAlignedSIMD( (float*)&Pos[1], OutPos[1] );
  1229. StoreAlignedSIMD( (float*)&Pos[2], OutPos[2] );
  1230. StoreAlignedSIMD( (float*)&Pos[3], OutPos[3] );
  1231. StoreAlignedSIMD( (float*)&Pos[4], OutPos[4] );
  1232. StoreAlignedSIMD( (float*)&Pos[5], OutPos[5] );
  1233. StoreAlignedSIMD( (float*)&Pos[6], OutPos[6] );
  1234. StoreAlignedSIMD( (float*)&Pos[7], OutPos[7] );
  1235. StoreAlignedSIMD( (float*)&Pos[8], OutPos[8] );
  1236. StoreAlignedSIMD( (float*)&Pos[9], OutPos[9] );
  1237. StoreAlignedSIMD( (float*)&Pos[10], OutPos[10] );
  1238. StoreAlignedSIMD( (float*)&Pos[11], OutPos[11] );
  1239. StoreAlignedSIMD( (float*)&Pos[12], OutPos[12] );
  1240. StoreAlignedSIMD( (float*)&Pos[13], OutPos[13] );
  1241. StoreAlignedSIMD( (float*)&Pos[14], OutPos[14] );
  1242. StoreAlignedSIMD( (float*)&Pos[15], OutPos[15] );
  1243. }
  1244. #endif