csgo/cstrike15_src/studiorender/r_studiosubd_patches.cpp


								#include "r_studiosubd_patches.h"

								#include "tier1/convar.h"

								#include <stdio.h>


								#define PI 3.14159265


								#ifdef _DEBUG
								// Debug-only capture lists. The Compute* position routines in this file append every
								// corner, edge and interior control point they produce (see their #ifdef _DEBUG tails).
								// Nothing in the visible code clears them.
								CUtlVector<Vector4D> g_DebugCornerPositions;
								CUtlVector<Vector4D> g_DebugEdgePositions;
								CUtlVector<Vector4D> g_DebugInteriorPositions;
								#endif


								//----------------------------------------------------------------------------------------------

								// static stencil buffers

								//----------------------------------------------------------------------------------------------


								#if !defined( USE_OPT )

								// Scalar weight tables, one row per valence in [0, MAX_VALENCE].
								// FillTables() populates the rows; the runtime Compute* routines only read them.

								// Position stencils built with the non-boundary rules.
								static float sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
								static float sPosEdge1Stencil[MAX_VALENCE+1][6];
								static float sPosEdge2Stencil[MAX_VALENCE+1][6];
								static float sPosInteriorStencil[MAX_VALENCE+1][4];

								// Limit-tangent stencil pairs: interior, smooth-boundary and corner variants.
								static float sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static float sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static float sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static float sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static float sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static float sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];

								// NOTE(review): these two are not referenced anywhere in the visible part of this file.
								static float sPosGregoryInterior1Stencil[6];
								static float sPosGregoryInterior2Stencil[6];

								// Position stencils built with the boundary rules.
								static float sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static float sPosEdge1BndStencil[MAX_VALENCE+1][6];
								static float sPosEdge2BndStencil[MAX_VALENCE+1][6];
								static float sPosInteriorBndStencil[MAX_VALENCE+1][4];

								// Edge stencils selected for boundary edges whose end vertex is flagged as a corner.
								static float sPosEdge1CornerStencil[MAX_VALENCE+1][6];
								static float sPosEdge2CornerStencil[MAX_VALENCE+1][6];

								#endif


								// Set once FillTables() has populated the stencil tables; later calls return immediately.
								static bool sTableInited = false;
								// Written by set_CornerCorrection(); not read in the visible part of this file.
								static bool sCornerCorrection = false;
								// When true, ComputeACCTangentPatches() keeps the raw geometry-patch tangents.
								static bool sShowACCGeometryTangents = false;
								// When false, ComputeCatmullClarkLimitTangents() treats corner vertices as non-corners.
								static bool sUseCornerTangents = true;


								// Sets sShowACCGeometryTangents. While it is true, ComputeACCTangentPatches() skips the
								// limit-tangent and continuity-correction pass and leaves the geometry-patch tangents.
								void set_ShowACCGeometryTangents(bool v)
								{
									sShowACCGeometryTangents = v;
								}


								// Sets sCornerCorrection.
								// NOTE(review): the flag is not consumed anywhere in the visible part of this file.
								void set_CornerCorrection(bool v)
								{
									sCornerCorrection = v;
								}


								// Sets sUseCornerTangents. When false, ComputeCatmullClarkLimitTangents() clears its
								// cornerVtx argument, so corner vertices take the smooth-boundary tangent path.
								void set_UseCornerTangents(bool v)
								{
									sUseCornerTangents = v;
								}


								// Weight used when averaging geometry-patch tangents around a corner sector:
								// sin( PI * j / n ) for tangent j out of n sector steps. The caller must pass n != 0.
								static float tangentAveraging( int n, int j)
								{
									const float sectorSteps = (float) n;
									const double phase = PI * j / sectorSteps;

									return sin( phase );
								}


								//--------------------------------------------------------------------------------------

								// Subdiv Stencils

								//--------------------------------------------------------------------------------------

								#if !defined( USE_OPT )


								// Builds the Catmull-Clark limit-position stencil for a vertex of valence n.
								//
								//   boundary      - non-zero selects the boundary rule (4/6 centre, 1/6 on the first and last
								//                   odd slot); zero selects the interior rule.
								//   n             - valence. FillTables() passes every value in [0, MAX_VALENCE].
								//   stencilBuffer - receives the weights. Slot 0 is the centre vertex; the interior rule
								//                   writes slots 1..2*n, so the buffer must hold at least 2*n+1 floats.
								//
								// Only the first 2*n slots are cleared; slot 2*n is either assigned (interior rule) or
								// left as the caller provided it (boundary rule).
								static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, float *stencilBuffer)
								{
									VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );

									memset(stencilBuffer, 0, 2*n*sizeof(float));

									if (!boundary)
									{
										// Weights n*n : 4 : 1, normalised so the whole stencil sums to one.
										float scale = 1.0f / (n*n + 5.0f*n);

										stencilBuffer[0] = n*n * scale;

										for (int i=0; i<n; i++)
										{
											stencilBuffer[2*i+1] = 4.0f * scale;
											stencilBuffer[2*i+2] = 1.0f * scale;
										}
									}
									else
									{
										int k = n-1;

										float s = 1.0f / 6.0f;
										stencilBuffer[0]     = s * 4.0f;
										stencilBuffer[1]     = s * 1.0f;

										// For n == 0 (the placeholder row FillTables() also builds) k is -1 and
										// 2*k+1 would address the float in front of the buffer. Skip that write.
										if ( k >= 0 )
										{
											stencilBuffer[2*k+1] = s * 1.0f;
										}
									}
								}


								// Builds the two Catmull-Clark limit-tangent stencils for a vertex of valence n.
								//
								//   bndVtx         - false: interior rule; true: one of the boundary rules below.
								//   cornerVtx      - with bndVtx set, selects the corner rule (fills stencilBuffer1 only and
								//                    does nothing for n <= 2); otherwise the smooth-boundary rule is used.
								//   n              - valence. FillTables() passes every value in [0, MAX_VALENCE].
								//   stencilBuffer1 - receives the first tangent stencil, slot 0 = centre vertex.
								//   stencilBuffer2 - receives the second tangent stencil.
								//
								// Both buffers must hold at least 2*n+1 floats. Only the first 2*n slots are cleared here.
								static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, float *stencilBuffer1, float *stencilBuffer2)
								{
									VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );

									memset( stencilBuffer1, 0, sizeof(float) * 2*n );
									memset( stencilBuffer2, 0, sizeof(float) * 2*n );

									if ( !bndVtx )
									{
										float scale_beta  = 1.0f / (n * sqrtf( 4.0f + cos( PI / n ) * cos( PI / n ) ) );
										float scale_alpha = 1.0f / n + cos( PI / n ) * scale_beta;

										for ( int i=0; i<n; i++ )
										{
											stencilBuffer1[2*i+1] = cos( 2*PI*i/n ) * scale_alpha;
											stencilBuffer1[2*i+2] = cos((2*PI*i+PI)/n ) * scale_beta;

											// Second stencil is the first one rotated by one ring step. For i == 0
											// j is -1, which yields the same cosines as n-1.
											int j = (i - 1)%n;
											stencilBuffer2[2*i+1] = cos( 2*PI*j/n ) * scale_alpha;
											stencilBuffer2[2*i+2] = cos((2*PI*j+PI)/n ) * scale_beta;
										}
									}
									else
									{
										// boundary vertex cases

										if ( cornerVtx )
										{
											if ( n<=2 )
												return;

											float sectorScale = 0, w;
											// treat first and last tangent (crease edges) separately
											w = tangentAveraging( n-1, 0 ); sectorScale += w;
											stencilBuffer1[ 1] +=  0.5 * w;
											stencilBuffer1[ 0] += -0.5 * w;

											w = tangentAveraging( n-1, n-1 ); sectorScale += w;
											stencilBuffer1[ 2*(n-1)+1] +=  0.5 * w;
											stencilBuffer1[ 0 ]        += -0.5 * w;

											// inner tangents are computed using the 6 weights from the geometry edge construction.
											for (int k=1; k<(n-1); k++)
											{
												w = tangentAveraging( n-1, k ); sectorScale += w;
												float scale = 1.0f / (2.0f*n + 10.0f);

												stencilBuffer1[        0] += w * (2.0f*n * scale - 1.0f);
												stencilBuffer1[2*(k-1)+1] += w *  2.0f   * scale;
												stencilBuffer1[2*(k-1)+2] += w *  1.0f   * scale;
												stencilBuffer1[2*(k-1)+3] += w *  4.0f   * scale;
												stencilBuffer1[2*(k-1)+4] += w *  1.0f   * scale;
												stencilBuffer1[2*(k-1)+5] += w *  2.0f   * scale;
											}

											// rescale weights
											for (int k = 0; k<2*n; k++)
											{
												stencilBuffer1[k] /= sectorScale;
											}
										}
										else
										{
											// special case to avoid colinear tangents
											if ( n==2 )
											{
												float s = 1.0f / 2.0f;
												stencilBuffer1[1] = 1.0 * s;
												stencilBuffer1[3] =-1.0 * s;

												stencilBuffer2[1] =-1.0 * s;
												stencilBuffer2[3] = 1.0 * s;

												// regularization term to avoid collinearity and preserve limit normal at the boundary
												float eps = 1e-4;
												stencilBuffer1[0] += eps * (-4.0/3.0);
												stencilBuffer1[1] += eps * (1.0/2.0);
												stencilBuffer1[2] += eps * (1.0/3.0);
												stencilBuffer1[3] += eps * (1.0/2.0);

												stencilBuffer2[0] += eps * (-4.0/3.0);
												stencilBuffer2[1] += eps * (1.0/2.0);
												stencilBuffer2[2] += eps * (1.0/3.0);
												stencilBuffer2[3] += eps * (1.0/2.0);
											}
											else
											{
												int k = n-1;

												// For n == 0 (the placeholder row FillTables() also builds) k is -1,
												// and the [2*k+1] writes below would hit the float in front of each
												// buffer. There is nothing meaningful to build, so leave the row as is.
												if ( k < 0 )
													return;

												float c = cos( PI / k ), s=sin( PI / k );

												stencilBuffer1[2*0+1] =  0.5f;
												stencilBuffer1[2*k+1] = -0.5f;

												stencilBuffer2[0] = -4.0f*s / (3.0f*k + c);  // gamma

												for (int i=0; i<k; ++i)
												{
													stencilBuffer2[2*i+1] = 4*sin(PI*i/k)/(3*k+c);                   // alpha_i
													stencilBuffer2[2*i+2] = (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c);   // beta_i
												}

												stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) );  // alpha_0, alpha_k
											}
										}
									}
								}


								// Builds the two 6-entry edge control-point stencils for a vertex of valence n.
								//
								//   boundary - non-zero: boundary-edge rule (2/3 : 1/3 on slots 0 and 3, rest zero).
								//   corner   - accepted for symmetry with the callers; the boundary rule is the same
								//              whether or not the vertex is a corner.
								//   n        - valence, only used by the interior rule.
								//   stencilBuffer1 / stencilBuffer2 - receive 6 floats each.
								static void computeACCEdgePosStencils(byte boundary, byte corner, int n, float *stencilBuffer1, float *stencilBuffer2)
								{
									VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );

									for ( int slot = 0; slot < 6; ++slot )
									{
										stencilBuffer1[slot] = 0.0f;
										stencilBuffer2[slot] = 0.0f;
									}

									if ( boundary )
									{
										// Corner and non-corner boundary edges share the same weights.
										const float third = 1.0f / 3.0f;

										stencilBuffer1[0] = 2.0f * third;
										stencilBuffer1[3] = 1.0f * third;

										stencilBuffer2[0] = 1.0f * third;
										stencilBuffer2[3] = 2.0f * third;
										return;
									}

									// Interior edge: weights (2n, 2, 1, 4, 1, 2) and (4, 1, 2, 2n, 2, 1) over 2n + 10.
									const float norm = 1.0f / (2.0f*n + 10.0f);

									stencilBuffer1[0] = 2.0f*n * norm;
									stencilBuffer1[1] = 2.0f * norm;
									stencilBuffer1[2] = 1.0f * norm;
									stencilBuffer1[3] = 4.0f * norm;
									stencilBuffer1[4] = 1.0f * norm;
									stencilBuffer1[5] = 2.0f * norm;

									stencilBuffer2[0] = 4.0f * norm;
									stencilBuffer2[1] = 1.0f * norm;
									stencilBuffer2[2] = 2.0f * norm;
									stencilBuffer2[3] = 2.0f*n* norm;
									stencilBuffer2[4] = 2.0f * norm;
									stencilBuffer2[5] = 1.0f * norm;
								}


								// Builds the 4-entry interior control-point stencil for a vertex of valence n:
								// weights (n, 2, 1, 2) divided by (n + 5). The boundary argument is not consulted.
								static void computeACCInteriorPosStencil(byte boundary, int n, float *stencilBuffer)
								{
									VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );

									const float norm = 1.0f / (n + 5.0f);
									float *pOut = stencilBuffer;

									*pOut++ = n * norm;
									*pOut++ = 2.0f * norm;
									*pOut++ = 1.0f * norm;
									*pOut   = 2.0f * norm;
								}


								void FillTables()

								{

									if ( sTableInited )	return;


									for ( int val=0; val<=MAX_VALENCE; val++ )

									{

										// interior stencils

										computeCatmullClarkLimitPosStencil(false, val, sPosCornerStencil[val]);

										computeACCEdgePosStencils(false, false, val, sPosEdge1Stencil[val], sPosEdge2Stencil[val]);

										computeACCInteriorPosStencil(false, val, sPosInteriorStencil[val]);


										// boundary stencils

										computeCatmullClarkLimitPosStencil(true, val, sPosCornerBndStencil[val]);

										computeACCEdgePosStencils(true, false, val, sPosEdge1BndStencil[val], sPosEdge2BndStencil[val]);

										computeACCEdgePosStencils(true, true, val, sPosEdge1CornerStencil[val], sPosEdge2CornerStencil[val]);

										computeACCInteriorPosStencil(true, val, sPosInteriorBndStencil[val]);


										computeCatmullClarkLimitTanStencil(false, false, val, sCCLimitTanStencil1[val], sCCLimitTanStencil2[val]);

										computeCatmullClarkLimitTanStencil(true, false, val, sCCLimitTanBndStencil1[val], sCCLimitTanBndStencil2[val]);

										computeCatmullClarkLimitTanStencil(true, true, val, sCCLimitTanCornerStencil1[val], sCCLimitTanCornerStencil2[val]);

									}


									sTableInited = true;

								}


								//--------------------------------------------------------------------------------------

								// Runtime

								//--------------------------------------------------------------------------------------


								#ifdef _DEBUG
								// Debug-only console variable, default "0", flagged FCVAR_CHEAT.
								// NOTE(review): not read anywhere in the visible part of this file.
								static ConVar mat_tess_dump( "mat_tess_dump", "0", FCVAR_CHEAT );
								#endif


								// Compute corner control points for each patch
								//
								//   pPos            - control-mesh positions, indexed through oneRing.
								//   oneRing         - indices of this corner's one-ring; oneRing[0] is the corner vertex.
								//   vtx1RingSize    - number of entries in oneRing.
								//   minOneRingIndex - rotation applied to the summation order (see loop below).
								//   bndVtx          - non-zero selects the boundary stencil table.
								//   cornerVtx       - non-zero: the limit position is the vertex itself.
								//   valence         - row of the stencil table to use; must be <= MAX_VALENCE.
								//   nbCorners       - not used by this routine.
								//   limitPos        - receives the result.
								inline void ComputeCatmullClarkLimitPosition( Vector4D *pPos, unsigned short *oneRing,
																			 unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
																			 unsigned short cornerVtx, unsigned short valence, unsigned short nbCorners, Vector4D &limitPos )
								{
									VPROF_BUDGET( "ComputeCatmullClarkLimitPosition", _T("SubD Rendering") );

									if ( cornerVtx > 0 )
									{
										limitPos = pPos[ oneRing[0] ];
									}
									else
									{
										// Use the same tier0 Assert as the other routines in this file rather than
										// the CRT assert, whose header this file does not include.
										Assert( valence <= MAX_VALENCE );

										float *pStencil = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ];

										// pStencil[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer)
										limitPos = pStencil[0] * pPos[ oneRing[0] ];
										for ( int k = 0; k < vtx1RingSize; k++ )
										{
											int idx = ( k + minOneRingIndex ) % vtx1RingSize;	// Shuffle to get the minimum index consistently first in order
											if ( idx != 0 )										// Don't do pStencil[0] again
											{
												limitPos += pStencil[idx] * pPos[ oneRing[idx] ];
											}
										}
									}
								#ifdef _DEBUG
									g_DebugCornerPositions.AddToTail( limitPos );
								#endif
								}


								// 3-component cross product of a and b; the w components are ignored and the
								// result's w is 0.
								inline Vector4D CrossProduct(const Vector4D& a, const Vector4D& b)
								{
									const float cx = a.y*b.z - a.z*b.y;
									const float cy = a.z*b.x - a.x*b.z;
									const float cz = a.x*b.y - a.y*b.x;

									return Vector4D( cx, cy, cz, 0.0f );
								}


								// Scales the xyz part of vec to (approximately) unit length in place; w is untouched.
								// Returns the xyz length measured before scaling.
								inline float VectorNormalize(Vector4D& vec)
								{
									const float lengthSqr = vec.x*vec.x + vec.y*vec.y + vec.z*vec.z;
									const float radius = sqrtf( lengthSqr );

									// Adding FLT_EPSILON keeps the divisor non-zero for a zero-length input.
									const float invRadius = 1.f / ( radius + FLT_EPSILON );

									vec.x *= invRadius;
									vec.y *= invRadius;
									vec.z *= invRadius;

									return radius;
								}


								// 3-component dot product of a and b; the w components do not contribute.
								FORCEINLINE float DotProduct(const Vector4D& a, const Vector4D& b)
								{
									const float xy = a.x*b.x + a.y*b.y;

									return ( xy + a.z*b.z );
								}


								// Computes the two limit tangents at patch corner idx (0..3) from the corner's one-ring.
								//
								//   idx          - patch corner; selects the sign pair and whether U/V are swapped.
								//   pPos         - control-mesh positions, indexed through oneRing.
								//   oneRing      - one-ring indices; oneRing[0] is the corner vertex.
								//   vtx1RingSize - number of one-ring entries.
								//   centerOffset - offset of this quad inside the ring; (centerOffset-1)/2 picks the sector.
								//   bndVtx       - zero: interior rule; non-zero: one of the boundary rules.
								//   cornerVtx    - corner classification; cleared when sUseCornerTangents is false.
								//   valence      - stencil-table row.
								//   loopGapAngle - written only on the corner path: the measured angle scaled to 0..65535.
								//   limitTanU/V  - receive the tangents.
								//
								// On the corner path with valence == 2 the function returns early: the outputs are left
								// untouched and the final swap / sign step is skipped.
								inline void ComputeCatmullClarkLimitTangents( int idx, Vector4D *pPos, unsigned short *oneRing, unsigned short vtx1RingSize,
																			 unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
																			 unsigned short valence, unsigned short &loopGapAngle,
																			 Vector4D &limitTanU, Vector4D &limitTanV )
								{
									// for valence=1, no need to have separate tangents

									// Per-corner signs applied after the optional U/V swap at the end.
									float tanUSign[] = {1,-1,-1,1};
									float tanVSign[] = {1,1,-1,-1};

									VPROF_BUDGET( "ComputeCatmullClarkLimitTangents", _T("SubD Rendering") );

									if ( !sUseCornerTangents )
										cornerVtx = 0;

									if ( !bndVtx )			// interior vertices
									{
										float *stencil1 = sCCLimitTanStencil1[ valence ];
										float *stencil2 = sCCLimitTanStencil2[ valence ];

										limitTanU = Vector4D(0,0,0,0);
										limitTanV = Vector4D(0,0,0,0);

										// Plain weighted sum of the one-ring with the two precomputed stencils.
										for (int k = 0; k < vtx1RingSize; ++k)
										{
											limitTanU += stencil1[k] * pPos[ oneRing[k] ];
											limitTanV += stencil2[k] * pPos[ oneRing[k] ];
										}

									}
									else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) )	// smooth boundary vertices
									{

										float *stencil1 = sCCLimitTanBndStencil1[ valence ];
										float *stencil2 = sCCLimitTanBndStencil2[ valence ];

										// r0 / r1 are the two stencil sums; the tangents are rotations within their span.
										Vector4D r0 = Vector4D(0,0,0,0);
										Vector4D r1 = Vector4D(0,0,0,0);

										for (int k = 0; k < vtx1RingSize; ++k)
										{
											r0 += stencil1[k] * pPos[ oneRing[k] ];
											r1 += stencil2[k] * pPos[ oneRing[k] ];
										}

										int j1 = (centerOffset - 1) / 2;
										int j2 = j1+1;
										int K = (valence - 1);

										if (valence == 2)
										{
											// The valence-2 stencils already hold the final tangents.
											limitTanU = r0;
											limitTanV = r1;
										}
										else
										{
											// Rotate by j*PI/K to reach the two edges bounding this quad's sector.
											limitTanU = cos(PI*j1 / K) * r0 + sin(PI*j1 / K) * r1;
											limitTanV = cos(PI*j2 / K) * r0 + sin(PI*j2 / K) * r1;
										}
									}
									else // corner vertices
									{
										// Early out: outputs and loopGapAngle stay as the caller passed them.
										if ( valence == 2 )
											return;

										float *pEdgeStencil = sPosEdge1Stencil[ valence ];
								//		float *avgStencil  = sCCLimitTanCornerStencil1[ valence ];

										// compute tangents
										// c0 / c1: vectors from the corner to the first and last one-ring entries.
										Vector4D c0 = pPos[ oneRing[1] ] - pPos[ oneRing[0] ];  c0.w = 0;
										Vector4D c1 = pPos[ oneRing[vtx1RingSize - 1] ] - pPos[ oneRing[0] ]; c1.w = 0;

										// e0 / e1: edge-stencil sums over the first and last five ring entries,
										// expressed relative to the corner (hence the "- 1.0f" on the centre weight).
										Vector4D e0 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
										Vector4D e1 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
										for (int k = 1; k < 6; k++ )
										{
											e0 += pEdgeStencil[k] * pPos[ oneRing[ k ] ];
											e1 += pEdgeStencil[k] * pPos[ oneRing[ vtx1RingSize - 6 + k ] ];
										}
										e0.w = 0; e1.w = 0;

										// compute average tangent plane normal
										Vector4D n0 = CrossProduct( c0, e0 ); VectorNormalize( n0 );
										Vector4D n1 = CrossProduct( e1, c1 ); VectorNormalize( n1 );
										Vector4D N = n0 + n1;
								//		N = N - ( DotProduct( N, tAvg )/ DotProduct(tAvg, tAvg) ) * tAvg;
										VectorNormalize( N );

										// project into tangent plane

										c0 = c0 - DotProduct(c0, N) * N;
										c1 = c1 - DotProduct(c1, N) * N;

										// Normalise both projections; cAvg keeps their mean length as the output scale.
										// NOTE(review): no guard against a zero-length projection here.
										float c0l = Vector4DLength( c0 ); c0 = c0 / c0l;
										float c1l = Vector4DLength( c1 ); c1 = c1 / c1l;
										float cAvg = (c0l + c1l) / 2;

										// compute angle
										Vector4D c0p = CrossProduct(N, c0);
										float angle = PI - atan2( DotProduct(c0p, c1), -DotProduct(c0, c1) );

										// Stored as a 16-bit fraction of a full turn (65535 == 2*PI).
										loopGapAngle = (unsigned int) ((65535.0 * angle) / (2*PI));

										// compute final tangent vector
										int j1 = (centerOffset - 1) / 2;
										int j2 = j1+1;
										int K = (valence - 1);

										limitTanU = cAvg * ( cos(angle*j1 / K) * c0 + sin(angle*j1 / K) * c0p );
										limitTanV = cAvg * ( cos(angle*j2 / K) * c0 + sin(angle*j2 / K) * c0p );
									}

									// flip tangents so they point in u/v direction
									if ( idx & 1 )
									{
										swap(limitTanU, limitTanV);
									}
									limitTanU *= tanUSign[idx];
									limitTanV *= tanVSign[idx];
								}


								// Computes the two edge control points associated with one patch corner.
								//
								//   pPos, oneRing    - control-mesh positions and this corner's one-ring (oneRing[0] = corner).
								//   centerOffset     - offset of this quad's entries inside the one-ring.
								//   bndEdge          - non-zero: boundary edge, only 4 stencil entries are used.
								//   bndVtx0/1        - non-zero: the corresponding valence is remapped to 2*(val-1).
								//   cornerVtx0/1     - pick the corner stencil table on boundary edges.
								//   loopGapAngle0/1  - not used by this routine.
								//   edgeBias0/1      - fixed-point bias, 32768 == 1.0; 16384 (0.5) on both selects the
								//                      precomputed-stencil path.
								//   val0/val1        - valences used for edgePos0 / edgePos1.
								//   minOneRingOffset - rotation of the summation order on the stencil path.
								//   vtx1RingSize     - not used by this routine.
								//   edgePos0/1       - receive the results.
								inline void ComputeACCEdgePositions( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset,
																	unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
																	unsigned short cornerVtx0, unsigned short cornerVtx1, unsigned short loopGapAngle0, unsigned short loopGapAngle1,
																	unsigned short edgeBias0, unsigned short edgeBias1, unsigned short val0, unsigned short val1,
																	unsigned short minOneRingOffset, unsigned short vtx1RingSize,
																	Vector4D &edgePos0, Vector4D &edgePos1)
								{
									VPROF_BUDGET( "ComputeACCEdgePositions", _T("SubD Rendering") );

									// Boundary vertices index the tables with a remapped valence.
									if ( bndVtx0 )
									{
										val0 = 2*(val0 - 1);
									}

									if ( bndVtx1 )
									{
										val1 = 2*(val1 - 1);
									}

									// Checked after the remap, since the remapped value is what indexes the tables.
									Assert( val0 <= MAX_VALENCE );
									Assert( val1 <= MAX_VALENCE );

									float* pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ];
									float* pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ];

									int kEnd = (bndEdge) ? 4 : 6;

									if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) )
									{
										// Slot 0 is the corner itself; slots 1.. map to consecutive ring entries
										// starting at centerOffset.
										int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 };
										for ( int i = 1; i < kEnd; i++ )
										{
											oneRingIndex[i] = centerOffset + i - 1;
										}

										edgePos0 = edgePos1 = Vector4D(0,0,0,0);
										for ( int k = 0; k < kEnd; k++ )
										{
											int idx = ( k + minOneRingOffset ) % kEnd;	// Offset to min index to enforce evaluation order between neighboring patches
											edgePos0 += pStencil0[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ];
											edgePos1 += pStencil1[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ];
										}
									}
									else
									{
										// Biased path: the interior-edge weights are written out explicitly, with the
										// two ring entries on each side scaled by b0 / b1 (b0 + b1 == 1).
										// With a bias of 0.5 this reduces to the interior stencil used above.
										float b0, b1;
										b1 = edgeBias0 / 32768.0, b0 = 1.0f-b1;
										edgePos0 = (val0 * pPos[ oneRing[0] ] + 2*b0*pPos[ oneRing[centerOffset + 0] ] + 1*b0*pPos[ oneRing[centerOffset + 1] ] +    2*pPos[ oneRing[centerOffset + 2] ] + 1*b1*pPos[ oneRing[centerOffset + 3] ] + 2*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val0 + 5.0f);
										b1 = edgeBias1 / 32768.0, b0 = 1.0f-b1;
										edgePos1 = (   2 * pPos[ oneRing[0] ] + 1*b0*pPos[ oneRing[centerOffset + 0] ] + 2*b0*pPos[ oneRing[centerOffset + 1] ] + val1*pPos[ oneRing[centerOffset + 2] ] + 2*b1*pPos[ oneRing[centerOffset + 3] ] + 1*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val1 + 5.0f);
									}

								#ifdef _DEBUG
									g_DebugEdgePositions.AddToTail( edgePos0 );
									g_DebugEdgePositions.AddToTail( edgePos1 );
								#endif
								}


								// Computes the interior control point associated with one patch corner.
								//
								//   pPos, oneRing - control-mesh positions and the corner's one-ring (oneRing[0] = corner).
								//   centerOffset  - offset of this quad's three ring entries inside oneRing.
								//   bndVtx        - non-zero: valence is remapped before the table lookup.
								//   valence       - stencil-table row (after the remap it must be <= MAX_VALENCE).
								//   interiorPos   - receives the result.
								inline void ComputeACCInteriorPosition( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, Vector4D &interiorPos )
								{
									VPROF_BUDGET( "ComputeACCInteriorPosition", _T("SubD Rendering") );

									if ( bndVtx )
									{
										// Boundary vertices use 2*(v-1) above valence 2 and 4*(v-1) otherwise.
										if ( valence > 2 )
										{
											valence = 2*(valence - 1);
										}
										else
										{
											valence = 4*(valence - 1);
										}
									}

									Assert( valence<=MAX_VALENCE );

									float *pWeights = sPosInteriorStencil[ valence ];
									unsigned short *pQuadRing = oneRing + centerOffset;

									// Corner vertex first, then the three ring entries belonging to this quad.
									interiorPos = pWeights[0] * pPos[ oneRing[0] ];
									for ( int i = 0; i < 3; ++i )
									{
										interiorPos += pWeights[i + 1] * pPos[ pQuadRing[i] ];
									}

								#ifdef _DEBUG
									g_DebugInteriorPositions.AddToTail( interiorPos );
								#endif
								}


								// Derives the tangent control nets from a 4x4 position net (row stride 4):
								//   TanU (4 rows x 3) = 3 * difference between horizontally adjacent positions,
								//   TanV (3 rows x 4) = 3 * difference between vertically adjacent positions.
								inline void ComputeACCGeometryPatchTangents( Vector4D *Pos, Vector4D *TanU, Vector4D *TanV )
								{
									VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );

									for ( int minor = 0; minor < 3; ++minor )
									{
										for ( int major = 0; major < 4; ++major )
										{
											const int uSrc = major*4 + minor;	// Pos index feeding TanU
											const int vSrc = minor*4 + major;	// Pos index feeding TanV (also its output slot)

											TanU[major*3 + minor] = 3*( Pos[uSrc + 1] - Pos[uSrc] );
											TanV[vSrc]            = 3*( Pos[vSrc + 4] - Pos[vSrc] );
										}
									}
								}


								// Fills the 16-entry geometry control net Pos for one quad.
								//
								//   pPos - control-mesh positions, indexed through the quad's one-ring lists.
								//   quad - topology record; its oneRing array stores the four corners' one-rings
								//          back to back, each vtx1RingSize[i] entries long.
								//   Pos  - receives the 4x4 control points.
								//
								// For each corner i this writes one corner point, two edge points and one interior point.
								void ComputeACCGeometryPatch( Vector4D* pPos, TopologyIndexStruct *quad, Vector4D* Pos)
								{
									VPROF_BUDGET( "ComputeACCGeometryPatch", _T("SubD Rendering") );

									// Lookup table for (x mod 4) with x in 0..7.
									int MOD4[8] = {0,1,2,3,0,1,2,3};

									// Output slots in the 4x4 net, per patch corner.
									int accCorner[]   = {0,3,15,12};
									int accEdge1[]    = {4,2,11,13};
									int accEdge2[]    = {8,1,7,14};
									int accInterior[] = {5,6,10,9};

									int vtx1RingStart = 0;

									unsigned short *oneRing = quad->oneRing;

									for ( int i=0; i<4; i++ ) // 4 corner vertices
									{
										ComputeCatmullClarkLimitPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingSize[i], quad->minOneRingOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->nbCornerVtx[i], Pos[ accCorner[i] ] );

										// MOD4[i+3] is the previous corner; the edge handled here runs from it to corner i.
										ComputeACCEdgePositions( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i],
											quad->bndEdge[ MOD4[i+3] ],
											quad->bndVtx[i], quad->bndVtx[MOD4[i+3]],
											quad->cornerVtx[i],    quad->cornerVtx[MOD4[i+3]],
											quad->loopGapAngle[i], quad->loopGapAngle[MOD4[i+3]],
											quad->edgeBias[ 2*MOD4[i+3] ], quad->edgeBias[ 2*MOD4[i+3] + 1 ],
											quad->valences[i], quad->valences[MOD4[i+3]],
											quad->minOneRingOffset[i], quad->vtx1RingSize[i],
											Pos[accEdge1[i]], Pos[accEdge2[i]] );

										// ComputeACCInteriorPosition takes (pPos, oneRing, centerOffset, bndVtx, valence, out);
										// it has no cornerVtx / loopGapAngle parameters, so they must not be passed.
										ComputeACCInteriorPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->valences[i], Pos[ accInterior[i] ] );

										vtx1RingStart += quad->vtx1RingSize[i];
									}
								}


								// Fills the tangent control nets TanU / TanV for one quad.
								//
								//   pPos - control-mesh positions, indexed through the quad's one-ring lists.
								//   quad - topology record; quad->loopGapAngle[i] may be written by the limit-tangent step.
								//   Pos  - the 4x4 geometry control net (input).
								//   TanU, TanV - receive the tangent nets.
								//
								// The nets are first derived from Pos. Unless NO_TANGENTS is defined or
								// sShowACCGeometryTangents is set, the four corner tangents are then replaced by the
								// limit tangents and the entries next to each non-boundary edge get a correction term.
								void ComputeACCTangentPatches( Vector4D* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV )
								{
									VPROF_BUDGET( "ComputeACCTangentPatches", _T("SubD Rendering") );

									// Lookup table for (x mod 4) with x in 0..7.
									int MOD4[8] = {0,1,2,3,0,1,2,3};

									// Slots of the four corner tangents inside TanU / TanV.
									int accTanCornerU[] = {0,2,11,9};  // counterclockwise orders!
									int accTanCornerV[] = {0,3,11,8};

									unsigned short *oneRing = quad->oneRing;

									ComputeACCGeometryPatchTangents(Pos, TanU, TanV);

								#if !defined( NO_TANGENTS )
									if ( !sShowACCGeometryTangents )
									{
										// compute corner tangents ( = subdivision surface limit tangents)
										int vtx1RingStart = 0;
										for ( int i=0; i<4; i++ )
										{
											int vtx1RingSize = quad->vtx1RingSize[i];

											Vector4D &accTanU = TanU[ accTanCornerU[i] ];
											Vector4D &accTanV = TanV[ accTanCornerV[i] ];

											ComputeCatmullClarkLimitTangents(i, pPos, &oneRing[vtx1RingStart], vtx1RingSize, quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->loopGapAngle[i], accTanU, accTanV );

											vtx1RingStart += vtx1RingSize;
										}

										// compute correction component to boundary tangents for tangent plane continuity
										//                             /TanV/ /TanU/ / TanV / /TanU/
										// Three along-edge tangent slots per edge, and the two across-edge slots they correct.
										static int   CB_CornerIdx[]   = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
										static int   CB_InteriorIdx[] = {1,2,   5,8,    10,9,    6,3 };
										static float CB_sign[]        = {1,-1,1,-1};

										for ( int i=0; i<4; i++ ) // for all quad edges
										{
											if ( !quad->bndEdge[i] )
											{
												// Odd edges swap the roles of the two nets.
												Vector4D *CBTanV = (i&1) ? TanU : TanV;
												Vector4D *CBTanU = (i&1) ? TanV : TanU;

												// Copies, so the updates below do not feed back into the inputs.
												Vector4D u00 = CBTanU[CB_CornerIdx[3*i + 0]];
												Vector4D u10 = CBTanU[CB_CornerIdx[3*i + 1]];
												Vector4D u20 = CBTanU[CB_CornerIdx[3*i + 2]];

												int val0 = quad->valences[i];
												int val1 = quad->valences[MOD4[i+1]];

												// Boundary vertices use one sector fewer.
												if ( quad->bndVtx[i] )
													val0--;
												if ( quad->bndVtx[MOD4[i+1]] )
													val1--;

												// loopGapAngle is a 16-bit fraction of a full turn (65535 == 2*PI).
												// NOTE(review): no guard against val0 / val1 reaching zero here.
												float c0 = cos( (2*PI * quad->loopGapAngle[     i   ] / 65535.0f) / val0 );
												float c1 = cos( (2*PI * quad->loopGapAngle[MOD4[i+1]] / 65535.0f) / val1 );

												CBTanV[ CB_InteriorIdx[2*i + 0] ] += CB_sign[i]*( 2*c0*u10 -   c1*u00 )/3.0f;
												CBTanV[ CB_InteriorIdx[2*i + 1] ] += CB_sign[i]*(   c0*u20 - 2*c1*u10 )/3.0f;
											}
										}

									}
								#endif

								}

								#endif  // !defined( USE_OPT )


								#if defined( USE_OPT )


								// 2*PI as a float literal.
								#define M_PI2			6.28318530717958647692f

								// SIMD constants for the USE_OPT path.
								// NOTE(review): their initialisation is not visible in this part of the file.
								static fltx4 Four_NegativeThirds;
								static fltx4 Four_Fives;
								static fltx4 Four_Tens;
								static fltx4 Four_N[32];
								static fltx4 Four_TwoPI;
								// NOTE(review): sized MAX_VALENCE, whereas the stencil tables below have MAX_VALENCE+1 rows.
								static fltx4 Four_Valence[MAX_VALENCE];
								static fltx4 Four_ValencePlus5[MAX_VALENCE];

								// SIMD counterparts of the scalar stencil tables: same names and dimensions, with each
								// weight held in an fltx4 (the builders below store it via ReplicateX4).
								static fltx4 sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
								static fltx4 sPosEdge1Stencil[MAX_VALENCE+1][6];
								static fltx4 sPosEdge2Stencil[MAX_VALENCE+1][6];
								static fltx4 sPosInteriorStencil[MAX_VALENCE+1][4];

								static fltx4 sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static fltx4 sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static fltx4 sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static fltx4 sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static fltx4 sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static fltx4 sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];

								static fltx4 sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
								static fltx4 sPosEdge1BndStencil[MAX_VALENCE+1][6];
								static fltx4 sPosEdge2BndStencil[MAX_VALENCE+1][6];
								static fltx4 sPosInteriorBndStencil[MAX_VALENCE+1][4];

								static fltx4 sPosEdge1CornerStencil[MAX_VALENCE+1][6];
								static fltx4 sPosEdge2CornerStencil[MAX_VALENCE+1][6];

								// NOTE(review): presumably precomputed sine / cosine tables; not filled or read in the visible code.
								static fltx4 sCCSinPI[MAX_VALENCE*2][MAX_VALENCE];
								static fltx4 sCCCosPI[MAX_VALENCE*2][MAX_VALENCE];

								static float Valence_MinusOne[MAX_VALENCE];


								// SIMD variant: builds the Catmull-Clark limit-position stencil for valence n, storing
								// each weight replicated across an fltx4.
								//
								//   boundary      - non-zero selects the boundary rule (4/6 centre, 1/6 on the first and last
								//                   odd slot); zero selects the interior rule.
								//   n             - valence.
								//   stencilBuffer - receives the weights. Slot 0 is the centre vertex; the interior rule
								//                   writes slots 1..2*n, so the buffer must hold at least 2*n+1 entries.
								//
								// Only the first 2*n slots are cleared here.
								static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
								{
									VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );

									for ( int i=0; i<2*n; ++i )
									{
										stencilBuffer[i] = Four_Zeros;
									}

									if ( !boundary )
									{
										// Weights n*n : 4 : 1, normalised so the whole stencil sums to one.
										float scale = 1.0f / (n*n + 5.0f*n);

										stencilBuffer[0] = ReplicateX4( n*n * scale );

										for ( int i=0; i<n; i++ )
										{
											stencilBuffer[2*i+1] = ReplicateX4( 4.0f * scale );
											stencilBuffer[2*i+2] = ReplicateX4( 1.0f * scale );
										}
									}
									else
									{
										int k = n-1;

										float s = 1.0f / 6.0f;
										stencilBuffer[0]     = ReplicateX4( s * 4.0f );
										stencilBuffer[1]     = ReplicateX4( s * 1.0f );

										// For n == 0, k is -1 and 2*k+1 would address the entry in front of the
										// buffer. Skip that write.
										if ( k >= 0 )
										{
											stencilBuffer[2*k+1] = ReplicateX4( s * 1.0f );
										}
									}
								}


								// SIMD variant: builds the two Catmull-Clark limit-tangent stencils for valence n,
								// storing each weight replicated across an fltx4.
								//
								//   bndVtx         - false: interior rule; true: one of the boundary rules below.
								//   cornerVtx      - with bndVtx set, selects the corner rule (fills stencilBuffer1 only and
								//                    does nothing for n <= 2); otherwise the smooth-boundary rule is used.
								//   n              - valence.
								//   stencilBuffer1 - receives the first tangent stencil, slot 0 = centre vertex.
								//   stencilBuffer2 - receives the second tangent stencil.
								//
								// Both buffers must hold at least 2*n+1 entries. Only the first 2*n slots are cleared here.
								static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
								{
									VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );

									for ( int i=0; i<2*n; ++i )
									{
										stencilBuffer1[i] = Four_Zeros;
										stencilBuffer2[i] = Four_Zeros;
									}

									if ( !bndVtx )
									{
										float scale_beta  = 1.0f / (n * sqrtf(4.0f + cos(PI/n)*cos(PI/n)));
										float scale_alpha = 1.0f/n + cos(PI/n) * scale_beta;

										for ( int i=0; i<n; i++ )
										{
											stencilBuffer1[2*i+1] = ReplicateX4( cos( 2*PI*i/n ) * scale_alpha );
											stencilBuffer1[2*i+2] = ReplicateX4( cos((2*PI*i+PI)/n ) * scale_beta );

											// Second stencil is the first one rotated by one ring step. For i == 0
											// j is -1, which yields the same cosines as n-1.
											int j = (i - 1)%n;
											stencilBuffer2[2*i+1] = ReplicateX4( cos( 2*PI*j/n ) * scale_alpha );
											stencilBuffer2[2*i+2] = ReplicateX4( cos((2*PI*j+PI)/n ) * scale_beta );
										}
									}
									else
									{
										// boundary vertex cases
										if ( cornerVtx )
										{
											if ( n<=2 )
												return;

											float sectorScale = 0, w;
											// treat first and last tangent (crease edges) separately
											w = tangentAveraging( n-1, 0 ); sectorScale += w;
											stencilBuffer1[ 1] = stencilBuffer1[ 1] + ReplicateX4( 0.5 * w );
											stencilBuffer1[ 0] = stencilBuffer1[ 0] + ReplicateX4( -0.5 * w );

											w = tangentAveraging( n-1, n-1 ); sectorScale += w;
											stencilBuffer1[ 2*(n-1)+1] = stencilBuffer1[ 2*(n-1)+1] + ReplicateX4( 0.5 * w );
											stencilBuffer1[ 0 ]        = stencilBuffer1[ 0 ]        + ReplicateX4( -0.5 * w );

											// inner tangents are computed using the 6 weights from the geometry edge construction.
											for (int k=1; k<(n-1); k++)
											{
												w = tangentAveraging( n-1, k ); sectorScale += w;
												float scale = 1.0f / (2.0f*n + 10.0f);

												stencilBuffer1[        0] = stencilBuffer1[        0] + ReplicateX4( w * (2.0f*n * scale - 1.0f) );
												stencilBuffer1[2*(k-1)+1] = stencilBuffer1[2*(k-1)+1] + ReplicateX4( w *  2.0f   * scale );
												stencilBuffer1[2*(k-1)+2] = stencilBuffer1[2*(k-1)+2] + ReplicateX4( w *  1.0f   * scale );
												stencilBuffer1[2*(k-1)+3] = stencilBuffer1[2*(k-1)+3] + ReplicateX4( w *  4.0f   * scale );
												stencilBuffer1[2*(k-1)+4] = stencilBuffer1[2*(k-1)+4] + ReplicateX4( w *  1.0f   * scale );
												stencilBuffer1[2*(k-1)+5] = stencilBuffer1[2*(k-1)+5] + ReplicateX4( w *  2.0f   * scale );
											}

											// rescale weights
											fltx4 fltx4Scale = ReplicateX4( sectorScale );
											for ( int k = 0; k<2*n; ++k )
											{
												stencilBuffer1[k] = DivSIMD( stencilBuffer1[k], fltx4Scale );
											}
										}
										else
										{
											// special case to avoid colinear tangents
											if ( n==2 )
											{
												float s = 1.0f / 2.0f;
												stencilBuffer1[1] = ReplicateX4(  1.0 * s );
												stencilBuffer1[3] = ReplicateX4( -1.0 * s );

												stencilBuffer2[1] = ReplicateX4( -1.0 * s );
												stencilBuffer2[3] = ReplicateX4(  1.0 * s );

												// regularization term to avoid collinearity and preserve limit normal at the boundary
												float eps = 1e-4;
												stencilBuffer1[0] = AddSIMD( stencilBuffer1[0], ReplicateX4( eps * (-4.0/3.0) ) );
												stencilBuffer1[1] = AddSIMD( stencilBuffer1[1], ReplicateX4( eps * (1.0/2.0) ) );
												stencilBuffer1[2] = AddSIMD( stencilBuffer1[2], ReplicateX4( eps * (1.0/3.0) ) );
												stencilBuffer1[3] = AddSIMD( stencilBuffer1[3], ReplicateX4( eps * (1.0/2.0) ) );

												stencilBuffer2[0] = AddSIMD( stencilBuffer2[0], ReplicateX4( eps * (-4.0/3.0) ) );
												stencilBuffer2[1] = AddSIMD( stencilBuffer2[1], ReplicateX4( eps * (1.0/2.0) ) );
												stencilBuffer2[2] = AddSIMD( stencilBuffer2[2], ReplicateX4( eps * (1.0/3.0) ) );
												stencilBuffer2[3] = AddSIMD( stencilBuffer2[3], ReplicateX4( eps * (1.0/2.0) ) );
											}
											else
											{
												int k = n-1;

												// For n == 0, k is -1 and the [2*k+1] writes below would hit the
												// entry in front of each buffer. There is nothing meaningful to
												// build for that valence, so leave the row as is.
												if ( k < 0 )
													return;

												float c = cos( PI / k ), s=sin( PI / k );

												stencilBuffer1[2*0+1] = ReplicateX4(  0.5f );
												stencilBuffer1[2*k+1] = ReplicateX4( -0.5f );

												stencilBuffer2[0] = ReplicateX4( -4.0f*s / (3.0f*k + c) );  // gamma

												for ( int i=0; i<k; ++i )
												{
													stencilBuffer2[2*i+1] = ReplicateX4( 4*sin(PI*i/k)/(3*k+c) );                   // alpha_i
													stencilBuffer2[2*i+2] = ReplicateX4( (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c) );   // beta_i
												}

												stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = ReplicateX4( -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) ) );  // alpha_0, alpha_k
											}
										}
									}
								}


								// Builds the pair of 6-entry edge control-point stencils for a vertex of valence n.
								//   boundary       - zero selects the interior-edge weights, non-zero the boundary-edge weights
								//   corner         - has no effect on the output: corner and non-corner boundary edges
								//                    receive exactly the same weights
								//   n              - vertex valence; only the interior weights depend on it
								//   stencilBuffer1 - receives the first edge stencil  (6 entries are written)
								//   stencilBuffer2 - receives the second edge stencil (6 entries are written)
								static void ComputeACCEdgePosStencils(byte boundary, byte corner, int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
								{
									VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );

									// Clear both stencils first; the boundary case only overwrites entries 0 and 3.
									for ( int slot = 0; slot < 6; ++slot )
									{
										stencilBuffer1[slot] = Four_Zeros;
										stencilBuffer2[slot] = Four_Zeros;
									}

									if ( boundary )
									{
										// Boundary edge: a 2/3 : 1/3 blend of entries 0 and 3, mirrored between the two stencils.
										const float third = 1.0f / 3.0f;
										const fltx4 oneThird  = ReplicateX4( 1.0f * third );
										const fltx4 twoThirds = ReplicateX4( 2.0f * third );

										stencilBuffer1[0] = twoThirds;
										stencilBuffer1[3] = oneThird;

										stencilBuffer2[0] = oneThird;
										stencilBuffer2[3] = twoThirds;
										return;
									}

									// Interior edge: weights { 2n, 2, 1, 4, 1, 2 } and { 4, 1, 2, 2n, 2, 1 }, each divided by (2n + 10).
									const float invSum = 1.0f / (2.0f*n + 10.0f);

									const float firstWeights[6] =
									{
										2.0f*n * invSum, 2.0f * invSum, 1.0f * invSum,
										4.0f * invSum,   1.0f * invSum, 2.0f * invSum
									};
									const float secondWeights[6] =
									{
										4.0f * invSum,   1.0f * invSum, 2.0f * invSum,
										2.0f*n * invSum, 2.0f * invSum, 1.0f * invSum
									};

									for ( int slot = 0; slot < 6; ++slot )
									{
										stencilBuffer1[slot] = ReplicateX4( firstWeights[slot] );
										stencilBuffer2[slot] = ReplicateX4( secondWeights[slot] );
									}
								}


								// Builds the 4-entry interior control-point stencil for a vertex of valence n:
								// weights { n, 2, 1, 2 } divided by their sum (n + 5).
								//   boundary      - not consulted; interior and boundary tables get the same weights
								//   n             - vertex valence
								//   stencilBuffer - receives the four replicated weights
								static void ComputeACCInteriorPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
								{
									VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );

									const float invSum = 1.0f / (n + 5.0f);
									const float weights[4] = { n * invSum, 2.0f * invSum, 1.0f * invSum, 2.0f * invSum };

									for ( int slot = 0; slot < 4; ++slot )
									{
										stencilBuffer[slot] = ReplicateX4( weights[slot] );
									}
								}


								static void ComputeACCSinCosPITables()

								{

									fltx4 PI4 = ReplicateX4( M_PI );


									for ( int j=0; j<MAX_VALENCE*2; ++j )

									{

										fltx4 j4 = ReplicateX4( (float)j );


										for ( int k=0; k<MAX_VALENCE; ++k )

										{

											fltx4 k4 = ReplicateX4( (float)k );

											fltx4 radians = DivSIMD( MulSIMD( PI4, j4 ), k4 );


											// not really simd

											SinCosSIMD( sCCSinPI[j][k], sCCCosPI[j][k], radians );

										}

									}

								}


								void FillTables()

								{

									if ( sTableInited )

										return;


									// Some simd stuff

									Four_TwoPI = ReplicateX4( 2*M_PI );

									Four_Tens = ReplicateX4( 10.0f );

									Four_Fives = ReplicateX4( 5 );

									Four_NegativeThirds = ReplicateX4( -0.333333333333333f );

									for ( int i=0; i<32; ++i )

									{

										Four_N[i] = ReplicateX4( (float)i );

									}

									for ( int i=0; i<MAX_VALENCE; ++i )

									{

										Four_Valence[i] = ReplicateX4( (float)i );

										Four_ValencePlus5[i] = ReplicateX4( (float)i + 5.0f );

										Valence_MinusOne[i] = (float)(i-1);

									}


									for ( int val=0; val<=MAX_VALENCE; val++ )

									{

										// interior stencils

										ComputeCatmullClarkLimitPosStencil( false, val, sPosCornerStencil[val] );

										ComputeACCEdgePosStencils( false, false, val, sPosEdge1Stencil[val], sPosEdge2Stencil[val] );

										ComputeACCInteriorPosStencil( false, val, sPosInteriorStencil[val] );


										// boundary stencils

										ComputeCatmullClarkLimitPosStencil( true, val, sPosCornerBndStencil[val] );

										ComputeACCEdgePosStencils( true, false, val, sPosEdge1BndStencil[val], sPosEdge2BndStencil[val] );

										ComputeACCEdgePosStencils( true, true, val, sPosEdge1CornerStencil[val], sPosEdge2CornerStencil[val] );

										ComputeACCInteriorPosStencil( true, val, sPosInteriorBndStencil[val] );


										ComputeCatmullClarkLimitTanStencil( false, false, val, sCCLimitTanStencil1[val], sCCLimitTanStencil2[val] );

										ComputeCatmullClarkLimitTanStencil( true, false, val, sCCLimitTanBndStencil1[val], sCCLimitTanBndStencil2[val] );

										ComputeCatmullClarkLimitTanStencil( true, true, val, sCCLimitTanCornerStencil1[val], sCCLimitTanCornerStencil2[val] );

									}


									// sincos tables

									ComputeACCSinCosPITables();


									sTableInited = true;

								}


								//--------------------------------------------------------------------------------------

								// Runtime

								//--------------------------------------------------------------------------------------

								// Evaluates the limit position of the vertex at the centre of a one-ring.
								//   pPos            - control-point positions, indexed through pOneRing
								//   pOneRing        - one-ring vertex indices; entry 0 is the centre vertex
								//   vtx1RingSize    - number of entries in pOneRing
								//   minOneRingIndex - rotation applied to the summation order so that it starts at a
								//                     consistent entry
								//   bndVtx          - non-zero selects the boundary stencil table
								//   cornerVtx       - non-zero: the limit position is simply the centre vertex position
								//   valence         - row of the stencil table to use (asserted <= MAX_VALENCE)
								//   limitPos        - receives the result
								FORCEINLINE void ComputeCatmullClarkLimitPosition( fltx4 *pPos, unsigned short *pOneRing,
																				   unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
																				   unsigned short cornerVtx, unsigned short valence, fltx4 &limitPos )
								{
									VPROF_BUDGET( "ComputeCatmullClarkLimitPosition (SIMD)", _T( "SubD Rendering" ) );

									assert( pPos );
									assert( pOneRing );

									// Corners are interpolated: no stencil needed
									if ( cornerVtx > 0 )
									{
										limitPos = pPos[ pOneRing[0]  ];
										return;
									}

									assert( valence <= MAX_VALENCE );

									fltx4 *pWeights = sPosCornerStencil[ valence ];
									if ( bndVtx )
									{
										pWeights = sPosCornerBndStencil[ valence ];
									}

									// The centre term always goes in first: pWeights[0] is always the largest value
									// (see Figures 4 and 5 in Loop and Schaefer)
									limitPos = MulSIMD( pWeights[0], pPos[ pOneRing[0] ] );

									for ( int step = 0; step < vtx1RingSize; step++ )
									{
										// Rotate the visiting order so the minimum index is consistently first
										const int ringSlot = ( step + minOneRingIndex ) % vtx1RingSize;
										if ( ringSlot == 0 )
										{
											continue;	// centre term was already accumulated above
										}

										limitPos = MaddSIMD( pWeights[ringSlot], pPos[ pOneRing[ringSlot] ], limitPos );
									}
								}


								// Returns A scaled by the estimated reciprocal square root of its 3-component
								// squared length (Dot3SIMD( A, A )), i.e. an approximately unit-length copy of A.
								// A itself is not modified.
								FORCEINLINE fltx4 VectorNormalize( fltx4 &A )
								{
									const fltx4 lengthSquared = Dot3SIMD( A, A );
									return MulSIMD( A, ReciprocalSqrtEstSIMD( lengthSquared ) );
								}


								// Returns the (estimated) 3-component Euclidean length of A.
								//
								// The previous implementation returned ReciprocalSqrtEstSIMD( |A|^2 ), i.e. 1 / length,
								// although callers divide by the result to normalise a vector and average the results
								// as edge lengths. The length is now formed as |A|^2 * (1 / |A|), which keeps the
								// cheap reciprocal-square-root estimate.
								//
								// NOTE(review): for a zero-length input this is presumably 0 * inf (NaN) - callers
								// already could not handle degenerate vectors, but confirm if that case can occur.
								FORCEINLINE fltx4 VectorLength( fltx4 &A )
								{
									fltx4 mag_sq = Dot3SIMD( A, A );						// length^2
									fltx4 invSqrt = ReciprocalSqrtEstSIMD( mag_sq );		// ~ 1 / length
									return MulSIMD( mag_sq, invSqrt );						// length^2 / length == length
								}


								// 3-component cross product A x B.
								//   _X360  : forwarded to XMVector3Cross
								//   _WIN32 : two shuffled multiplies and a subtract
								//   other  : scalar fallback; the fourth component is written as 0
								FORCEINLINE fltx4 CrossProduct( const fltx4 &A, const fltx4 &B )
								{
								#if defined( _X360 )
									return XMVector3Cross( A, B );
								#elif defined( _WIN32 )
									// First product: A shuffled (1,2,0) times B shuffled (2,0,1)
									const fltx4 lhsPositive = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
									const fltx4 rhsPositive = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
									const fltx4 positiveTerms = MulSIMD( lhsPositive, rhsPositive );

									// Second product: the same two shuffles with the operands' roles exchanged
									const fltx4 lhsNegative = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
									const fltx4 rhsNegative = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
									const fltx4 negativeTerms = MulSIMD( lhsNegative, rhsNegative );

									return SubSIMD( positiveTerms, negativeTerms );
								#else
									const float ax = SubFloat( A, 0 ), ay = SubFloat( A, 1 ), az = SubFloat( A, 2 );
									const float bx = SubFloat( B, 0 ), by = SubFloat( B, 1 ), bz = SubFloat( B, 2 );

									fltx4 result;
									SubFloat( result, 0 ) = ay*bz - az*by;
									SubFloat( result, 1 ) = az*bx - ax*bz;
									SubFloat( result, 2 ) = ax*by - ay*bx;
									SubFloat( result, 3 ) = 0;
									return result;
								#endif
								}


								// Computes the two limit tangents at the centre vertex of a one-ring.
								//   idx          - index (0..3) into the tanUSign / tanVSign tables below; when odd,
								//                  the two tangents are also swapped before the signs are applied
								//   pPos         - control-point positions, indexed through pOneRing
								//   pOneRing     - one-ring vertex indices; entry 0 is the centre vertex
								//   vtx1RingSize - number of entries in pOneRing
								//   centerOffset - offset into the one-ring; (centerOffset - 1) / 2 selects the rotation
								//                  step used by the boundary and corner cases
								//   bndVtx       - zero: interior stencils; non-zero: boundary or corner handling
								//   cornerVtx    - corner classification; forced to 0 when sUseCornerTangents is off
								//   valence      - row of the stencil tables to use
								//   loopGapAngle - written only by the corner case (valence != 2), with the angle
								//                  computed from the two projected corner edges
								//   limitTanU/V  - receive the tangents. In the corner case with valence == 2 the
								//                  function returns early and leaves both untouched (no swap, no sign).
								FORCEINLINE void ComputeCatmullClarkLimitTangents( int idx, fltx4 *pPos, unsigned short *pOneRing, unsigned short vtx1RingSize,
																			 unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
																			 unsigned short valence, float &loopGapAngle, fltx4 &limitTanU, fltx4 &limitTanV )
								{
									VPROF_BUDGET( "ComputeCatmullClarkLimitTangents (SIMD)", _T( "SubD Rendering" ) );

									// for valence=1, no need to have separate tangents
									// Per-idx sign applied to the finished tangents at the bottom of this function
									static const fltx4 tanUSign[4] = { Four_Ones, Four_NegativeOnes, Four_NegativeOnes, Four_Ones };
									static const fltx4 tanVSign[4] = { Four_Ones, Four_Ones, Four_NegativeOnes, Four_NegativeOnes };

									// With corner tangents disabled, corners fall through to the smooth boundary path
									if (!sUseCornerTangents) cornerVtx = 0;

									// interior vertices
									if ( !bndVtx )
									{
										fltx4 *pStencil0 = sCCLimitTanStencil1[ valence ];
										fltx4 *pStencil1 = sCCLimitTanStencil2[ valence ];

										limitTanU = limitTanV = Four_Zeros;

										// Each tangent is a weighted sum over the whole one-ring
										for ( int k = 0; k < vtx1RingSize; k++ )
										{
											limitTanU = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], limitTanU );
											limitTanV = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], limitTanV );
										}

									}
									else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) )
									{
										// smooth boundary vertices
										fltx4 *pStencil0 = sCCLimitTanBndStencil1[ valence ];
										fltx4 *pStencil1 = sCCLimitTanBndStencil2[ valence ];

										// r0 / r1: the two stencil-weighted sums over the one-ring
										fltx4 r0 = Four_Zeros;
										fltx4 r1 = Four_Zeros;

										for (int k = 0; k < vtx1RingSize; ++k)
										{
											r0 = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], r0 );
											r1 = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], r1 );
										}

										// Indices into the sin/cos( PI * j / k ) tables filled by ComputeACCSinCosPITables
										int j1 = ( centerOffset - 1 ) / 2;
										int j2 = j1 + 1;
										int k = valence - 1;

										if ( valence == 2 )
										{
											// The valence-2 sums are used directly, without rotation
											limitTanU = r0;
											limitTanV = r1;
										}
										else
										{
											// Combine r0 and r1 with cos/sin( PI * j1 / k ) and cos/sin( PI * j2 / k )
											limitTanU = AddSIMD( MulSIMD( sCCCosPI[j1][k], r0 ), MulSIMD( sCCSinPI[j1][k], r1 ) );
											limitTanV = AddSIMD( MulSIMD( sCCCosPI[j2][k], r0 ), MulSIMD( sCCSinPI[j2][k], r1 ) );
										}
									}
									else
									{
										// Corner vertices
										// Valence-2 corners keep whatever the caller already stored in limitTanU / limitTanV
										if ( valence == 2 )
											return;

										fltx4 *pEdgeStencil = sPosEdge1Stencil[ valence ];

										// Compute tangents
										// c0 / c1: vectors from the centre vertex to the first and last one-ring entries
										fltx4 c0 = SubSIMD( pPos[ pOneRing[ 1 ] ], pPos[ pOneRing[ 0 ] ] );
										fltx4 c1 = SubSIMD( pPos[ pOneRing[ vtx1RingSize - 1 ] ], pPos[ pOneRing[ 0 ] ] );

										// e0 / e1: interior edge stencil applied to the first five and the last five
										// one-ring entries, with the centre weight reduced by one (edge point minus centre).
										// NOTE(review): indexes pOneRing[ vtx1RingSize - 6 + k ], so this assumes vtx1RingSize >= 6 - confirm.
										fltx4 e0 = MulSIMD( SubSIMD( pEdgeStencil[0], Four_Ones ), pPos[ pOneRing[ 0 ] ] );
										fltx4 e1 = e0;
										for ( int k = 1; k < 6; k++ )
										{
											e0 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ k ] ], e0 );
											e1 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ vtx1RingSize - 6 + k ] ], e1 );
										}

										// Compute average tangent plane normal
										fltx4 n0 = CrossProduct( c0, e0 );
										n0 = VectorNormalize( n0 );
										fltx4 n1 = CrossProduct( e1, c1 );
										n1 = VectorNormalize( n1 );
										fltx4 N = AddSIMD( n0, n1 );
										N = VectorNormalize( N );

										// Project into tangent plane
										fltx4 DotC0N = Dot3SIMD( c0, N );
										fltx4 DotC1N = Dot3SIMD( c1, N );

										c0 = SubSIMD( c0, MulSIMD( DotC0N, N ) );
										c1 = SubSIMD( c1, MulSIMD( DotC1N, N ) );

										// Scale each projected edge by the reciprocal of what VectorLength() returns for it,
										// and keep the mean of the two returned values as the final tangent magnitude
										fltx4 c0l = VectorLength( c0 );
										c0 = DivSIMD( c0, c0l );
										fltx4 c1l = VectorLength( c1 );
										c1 = DivSIMD( c1, c1l );
										fltx4 cAvg = MulSIMD( AddSIMD(c0l,c1l), Four_PointFives );

										// Compute angle
										// c0p is c0 rotated a quarter turn about N, giving a 2D basis (c0, c0p) in the tangent plane
										fltx4 c0p = CrossProduct(N, c0);
										fltx4 dot1 = Dot3SIMD(c0p, c1);
										fltx4 dot2 = Dot3SIMD(c0, c1);

										float angle = PI - atan2( SubFloat( dot1, 0 ), -SubFloat( dot2, 0 ) );

										// Report the measured angle back to the caller
										loopGapAngle = angle;

										// Compute final tangent vector
										int j1 = ( centerOffset - 1 ) / 2;
										int j2 = j1 + 1;
										int K = (valence - 1);

										// fK[i] == (float)i: integer-to-float conversion done by table lookup
										static float fK[MAX_VALENCE] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
																		 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
																		 17.0f, 18.0f };
										// Compute final tangent vector
										float flK = fK[K];

										// The measured angle is split into K equal steps; the tangents sit at steps j1 and j2
										fltx4 Cos0 = ReplicateX4( cos( angle*j1 / flK ) );
										fltx4 Sin0 = ReplicateX4( sin( angle*j1 / flK ) );
										fltx4 Cos1 = ReplicateX4( cos( angle*j2 / flK ) );
										fltx4 Sin1 = ReplicateX4( sin( angle*j2 / flK ) );

										limitTanU = cAvg * ( Cos0 * c0 + Sin0 * c0p );
										limitTanV = cAvg * ( Cos1 * c0 + Sin1 * c0p );
									}

									// Flip tangents so they point in u/v direction
									if ( idx & 1 )
									{
										V_swap( limitTanU, limitTanV );
									}

									limitTanU = MulSIMD( limitTanU, tanUSign[idx] );
									limitTanV = MulSIMD( limitTanV, tanVSign[idx] );
								}


								// Computes the two edge control points of one patch edge from a vertex one-ring.
								//   pPos             - control-point positions, indexed through oneRing
								//   oneRing          - one-ring vertex indices; entry 0 is the centre vertex
								//   centerOffset     - offset of the first one-ring entry used by the edge stencils
								//   bndEdge          - non-zero: boundary edge (4-entry stencils), zero: interior (6-entry)
								//   bndVtx0/1        - non-zero: the matching valence is remapped to 2 * (valence - 1)
								//   cornerVtx0/1     - on a boundary edge, select the corner stencil tables
								//   edgeBias0/1      - blend bias in 1/32768 units; 16384 (one half) for both selects the
								//                      precomputed stencil path, anything else the explicit biased formula
								//   val0/val1        - valences of the two edge end points
								//   minOneRingOffset - rotation of the summation order on the stencil path, used to
								//                      enforce evaluation order between neighbouring patches
								//   vtx1RingSize     - not used by this function
								//   edgePos0/1       - receive the two edge control points
								FORCEINLINE void ComputeACCEdgePositions( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset,
																	unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
																	unsigned short cornerVtx0, unsigned short cornerVtx1,
																	unsigned short edgeBias0, unsigned short edgeBias1,
																	unsigned short val0, unsigned short val1,
																	unsigned short minOneRingOffset, unsigned short vtx1RingSize,
																	fltx4 &edgePos0, fltx4 &edgePos1)
								{
									VPROF_BUDGET( "ComputeACCEdgePositions (SIMD)", _T("SubD Rendering") );

									// Boundary vertices index the tables with a remapped valence
									if ( bndVtx0 )
									{
										val0 = 2*(val0 - 1);
									}

									if ( bndVtx1 )
									{
										val1 = 2*(val1 - 1);
									}

									Assert( val0 <= MAX_VALENCE );
									Assert( val1 <= MAX_VALENCE );

									// Table choice: boundary edge ? (corner ? corner table : boundary table) : interior table
									fltx4 *pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ];
									fltx4 *pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ];

									int kEnd = (bndEdge) ? 4 : 6;

									if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) )
									{
										// Unbiased edge: apply the precomputed stencils.
										// Stencil slot 0 is the centre vertex; slot i >= 1 maps to oneRing[ centerOffset + i - 1 ].
										int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 };
										for ( int i = 1; i < kEnd; i++ )
										{
											oneRingIndex[i] = centerOffset + i - 1;
										}

										edgePos0 = edgePos1 = Four_Zeros;
										for ( int k = 0; k < kEnd; k++ )
										{
											int idx = ( k + minOneRingOffset ) % kEnd;	// Offset to min index to enforce evaluation order between neighboring patches
											edgePos0 = MaddSIMD( pStencil0[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos0 );
											edgePos1 = MaddSIMD( pStencil1[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos1 );
										}
									}
									else
									{
										// Biased edge: the interior stencil weights are redistributed between the two sides
										// of the edge by b0 = 1 - bias and b1 = bias. Each weight set still sums to
										// (valence + 5), which is therefore the divisor.
										fltx4 b0, b1;

										b1 = ReplicateX4( edgeBias0 / 32768.0f );
										b0 = SubSIMD( Four_Ones, b1 );
										edgePos0 = DivSIMD( ( Four_Valence[val0]*pPos[ oneRing[0] ] +
															  Four_Twos*b0*pPos[ oneRing[ centerOffset] ] +
															  b0*pPos[ oneRing[centerOffset + 1] ] +
															  Four_Twos*pPos[ oneRing[centerOffset + 2] ] +
															  b1*pPos[ oneRing[centerOffset + 3] ] +
															  Four_Twos*b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] );

										b1 = ReplicateX4( edgeBias1 / 32768.0f );
										b0 = SubSIMD( Four_Ones, b1 );
										// Weights here are 2 + b0 + 2*b0 + val1 + 2*b1 + b1 == val1 + 5, so the normaliser
										// must be taken from val1 (it was previously taken from val0).
										edgePos1 = DivSIMD( ( Four_Twos*pPos[ oneRing[0] ] +
															  b0*pPos[ oneRing[centerOffset + 0] ] +
															  Four_Twos*b0*pPos[ oneRing[centerOffset + 1] ] +
															  Four_Valence[val1]*pPos[ oneRing[centerOffset + 2] ] +
															  Four_Twos*b1*pPos[ oneRing[centerOffset + 3] ] +
															  b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val1] );
									}
								}


								// Computes one interior control point from a vertex one-ring.
								//   pPos         - control-point positions, indexed through oneRing
								//   oneRing      - one-ring vertex indices; entry 0 is the centre vertex
								//   centerOffset - offset of the first of the three ring entries that are blended in
								//   bndVtx       - non-zero: valence is remapped before the table lookup
								//                  (2 * (valence - 1) when valence > 2, otherwise 4 * (valence - 1))
								//   valence      - vertex valence
								//   interiorPos  - receives the result
								FORCEINLINE void ComputeACCInteriorPosition( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, fltx4 &interiorPos )
								{
									VPROF_BUDGET( "ComputeACCInteriorPosition (SIMD)", _T( "SubD Rendering" ) );

									if ( bndVtx )
									{
										if ( valence > 2 )
										{
											valence = 2 * (valence - 1);
										}
										else
										{
											valence = 4 * (valence - 1);
										}
									}

									Assert( valence <= MAX_VALENCE );

									// The interior table is used for boundary vertices as well
									fltx4 *pWeights = sPosInteriorStencil[ valence ];
									unsigned short *pQuadRing = oneRing + centerOffset;

									// Centre vertex first, then the three ring entries in order
									interiorPos = MulSIMD( pWeights[0], pPos[ oneRing[0] ] );
									for ( int i = 0; i < 3; i++ )
									{
										interiorPos = MaddSIMD( pWeights[i + 1], pPos[ pQuadRing[i] ], interiorPos );
									}
								}


								// Derives the tangent control points from the 16 position control points, treating
								// Pos as a 4x4 grid stored row by row (Pos[4*row + col]):
								//   TanU[3*row + col] = 3 * ( Pos[4*row + col + 1]   - Pos[4*row + col] )   (4 rows x 3 columns)
								//   TanV[4*row + col] = 3 * ( Pos[4*(row + 1) + col] - Pos[4*row + col] )   (3 rows x 4 columns)
								// Kept fully unrolled. Assumes Pos, TanU and TanV do not overlap.
								FORCEINLINE void ComputeACCGeometryPatchTangents( fltx4 *Pos, fltx4 *TanU, fltx4 *TanV )
								{
									//VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );

									// U differences, row 0
									TanU[0]  = MulSIMD( Four_Threes, SubSIMD( Pos[1],  Pos[0]  ) );
									TanU[1]  = MulSIMD( Four_Threes, SubSIMD( Pos[2],  Pos[1]  ) );
									TanU[2]  = MulSIMD( Four_Threes, SubSIMD( Pos[3],  Pos[2]  ) );
									// U differences, row 1
									TanU[3]  = MulSIMD( Four_Threes, SubSIMD( Pos[5],  Pos[4]  ) );
									TanU[4]  = MulSIMD( Four_Threes, SubSIMD( Pos[6],  Pos[5]  ) );
									TanU[5]  = MulSIMD( Four_Threes, SubSIMD( Pos[7],  Pos[6]  ) );
									// U differences, row 2
									TanU[6]  = MulSIMD( Four_Threes, SubSIMD( Pos[9],  Pos[8]  ) );
									TanU[7]  = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[9]  ) );
									TanU[8]  = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[10] ) );
									// U differences, row 3
									TanU[9]  = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[12] ) );
									TanU[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[13] ) );
									TanU[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[14] ) );

									// V differences, rows 0 -> 1
									TanV[0]  = MulSIMD( Four_Threes, SubSIMD( Pos[4],  Pos[0]  ) );
									TanV[1]  = MulSIMD( Four_Threes, SubSIMD( Pos[5],  Pos[1]  ) );
									TanV[2]  = MulSIMD( Four_Threes, SubSIMD( Pos[6],  Pos[2]  ) );
									TanV[3]  = MulSIMD( Four_Threes, SubSIMD( Pos[7],  Pos[3]  ) );
									// V differences, rows 1 -> 2
									TanV[4]  = MulSIMD( Four_Threes, SubSIMD( Pos[8],  Pos[4]  ) );
									TanV[5]  = MulSIMD( Four_Threes, SubSIMD( Pos[9],  Pos[5]  ) );
									TanV[6]  = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[6]  ) );
									TanV[7]  = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[7]  ) );
									// V differences, rows 2 -> 3
									TanV[8]  = MulSIMD( Four_Threes, SubSIMD( Pos[12], Pos[8]  ) );
									TanV[9]  = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[9]  ) );
									TanV[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[10] ) );
									TanV[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[11] ) );
								}


								// Evaluates all control points of one quad patch and stores them to the output arrays.
								//   pPos          - control-point positions, indexed through the quad's one-rings
								//   quad          - per-quad topology: four concatenated one-rings plus per-vertex and
								//                   per-edge flags (boundary, corner, valence, bias, loop gap angle)
								//   Pos           - receives 16 position control points
								//   TanU / TanV   - receive 12 tangent control points each (skipped when NO_TANGENTS is
								//                   defined, or for regular patches when SEPARATE_REGULAR_AND_EXTRA is defined)
								//   bRegularPatch - only consulted when SEPARATE_REGULAR_AND_EXTRA is defined
								// NOTE(review): results are written with StoreAlignedSIMD, so the output arrays are
								// presumably required to be 16-byte aligned - confirm against callers.
								void ComputeACCAllPatches( fltx4* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV, bool bRegularPatch )
								{
									VPROF_BUDGET( "ComputeACCAllPatches (SIMD)", _T( "SubD Rendering" ) );
									// Output slots, one entry per quad vertex 0..3.
									// The first four tables index OutPos; the last two index OutTanU / OutTanV.
									int accCorner[]     = { 0, 3, 15, 12 };
									int accEdge1[]      = { 4, 2, 11, 13 };
									int accEdge2[]      = { 8, 1, 7,  14 };
									int accInterior[]   = { 5, 6, 10, 9  };
									int accTanCornerU[] = { 0, 2, 11, 9  };  // counterclockwise orders!
									int accTanCornerV[] = { 0, 3, 11, 8  };

									fltx4 OutPos[16], OutTanU[16], OutTanV[16];

									// Point to four one-rings
									// (they are stored back to back in quad->oneRing, each vtx1RingSize[i] entries long)
									int vtx1RingStart = 0;
									unsigned short* pOneRing[4];
									for ( int i = 0; i < 4; i++ )
									{
										unsigned short vtx1RingSize = quad->vtx1RingSize[i];
										pOneRing[i] = &(quad->oneRing[vtx1RingStart]);
										vtx1RingStart += vtx1RingSize;
									}

									{
										VPROF_BUDGET( "ComputeACCAllPatches - Geometry Control Points (SIMD)", _T( "SubD Rendering" ) );

										// Corner control points: limit position of each quad vertex
										ComputeCatmullClarkLimitPosition( pPos, pOneRing[0], quad->vtx1RingSize[0], quad->minOneRingOffset[0], quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], OutPos[ accCorner[0] ] );
										ComputeCatmullClarkLimitPosition( pPos, pOneRing[1], quad->vtx1RingSize[1], quad->minOneRingOffset[1], quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], OutPos[ accCorner[1] ] );
										ComputeCatmullClarkLimitPosition( pPos, pOneRing[2], quad->vtx1RingSize[2], quad->minOneRingOffset[2], quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], OutPos[ accCorner[2] ] );
										ComputeCatmullClarkLimitPosition( pPos, pOneRing[3], quad->vtx1RingSize[3], quad->minOneRingOffset[3], quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], OutPos[ accCorner[3] ] );

										// Edge control points: vertex i is paired with vertex (i + 3) % 4,
										// using bndEdge[(i + 3) % 4] and edgeBias[(2*i + 6) % 8] and the entry after it
										ComputeACCEdgePositions( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0],
																 quad->bndEdge[3], quad->bndVtx[0], quad->bndVtx[3],
																 quad->cornerVtx[0], quad->cornerVtx[3],
																 quad->edgeBias[6], quad->edgeBias[7],
																 quad->valences[0], quad->valences[3],
																 quad->minOneRingOffset[0], quad->vtx1RingSize[0],
																 OutPos[accEdge1[0]], OutPos[accEdge2[0]] );
										ComputeACCEdgePositions( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1],
																 quad->bndEdge[0], quad->bndVtx[1], quad->bndVtx[0],
																 quad->cornerVtx[1], quad->cornerVtx[0],
																 quad->edgeBias[0], quad->edgeBias[1],
																 quad->valences[1], quad->valences[0],
																 quad->minOneRingOffset[1], quad->vtx1RingSize[1],
																 OutPos[accEdge1[1]], OutPos[accEdge2[1]] );
										ComputeACCEdgePositions( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2],
																 quad->bndEdge[1], quad->bndVtx[2], quad->bndVtx[1],
																 quad->cornerVtx[2], quad->cornerVtx[1],
																 quad->edgeBias[2], quad->edgeBias[3],
																 quad->valences[2], quad->valences[1],
																 quad->minOneRingOffset[2], quad->vtx1RingSize[2],
																 OutPos[accEdge1[2]], OutPos[accEdge2[2]] );
										ComputeACCEdgePositions( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3],
																 quad->bndEdge[2], quad->bndVtx[3], quad->bndVtx[2],
																 quad->cornerVtx[3], quad->cornerVtx[2],
																 quad->edgeBias[4], quad->edgeBias[5],
																 quad->valences[3], quad->valences[2],
																 quad->minOneRingOffset[3], quad->vtx1RingSize[3],
																 OutPos[accEdge1[3]], OutPos[accEdge2[3]] );

										// Interior control points, one per quad vertex
										ComputeACCInteriorPosition( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0], quad->bndVtx[0], quad->valences[0], OutPos[ accInterior[0] ] );
										ComputeACCInteriorPosition( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1], quad->bndVtx[1], quad->valences[1], OutPos[ accInterior[1] ] );
										ComputeACCInteriorPosition( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2], quad->bndVtx[2], quad->valences[2], OutPos[ accInterior[2] ] );
										ComputeACCInteriorPosition( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3], quad->bndVtx[3], quad->valences[3], OutPos[ accInterior[3] ] );
									}

								#if !defined( NO_TANGENTS )
									// Don't compute tangents for regular patches
								#if defined( SEPARATE_REGULAR_AND_EXTRA )
									if ( !bRegularPatch )
								#endif
									{
										VPROF_BUDGET( "ComputeACCAllPatches - Tangents (SIMD)", _T( "SubD Rendering" ) );

										// Start from the tangents implied by the position control points;
										// the corner entries are then overwritten and the boundary entries corrected below
										ComputeACCGeometryPatchTangents( OutPos, OutTanU, OutTanV );

										// Stored loop gap angles are scaled by M_PI2 / 65535 to obtain the float angle
										float flLoopGap[4];
										flLoopGap[0] = ( M_PI2 * quad->loopGapAngle[0] ) / 65535.0f;
										flLoopGap[1] = ( M_PI2 * quad->loopGapAngle[1] ) / 65535.0f;
										flLoopGap[2] = ( M_PI2 * quad->loopGapAngle[2] ) / 65535.0f;
										flLoopGap[3] = ( M_PI2 * quad->loopGapAngle[3] ) / 65535.0f;
										if ( !sShowACCGeometryTangents )
										{
											{
												// Corner tangents. flLoopGap[i] is passed by reference and may be
												// overwritten by the corner-vertex case of the callee.
												ComputeCatmullClarkLimitTangents( 0, pPos, pOneRing[0], quad->vtx1RingSize[0], quad->vtx1RingCenterQuadOffset[0],
													quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], flLoopGap[0], OutTanU[ accTanCornerU[0] ], OutTanV[ accTanCornerV[0] ] );
												ComputeCatmullClarkLimitTangents( 1, pPos, pOneRing[1], quad->vtx1RingSize[1], quad->vtx1RingCenterQuadOffset[1],
													quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], flLoopGap[1], OutTanU[ accTanCornerU[1] ], OutTanV[ accTanCornerV[1] ] );
												ComputeCatmullClarkLimitTangents( 2, pPos, pOneRing[2], quad->vtx1RingSize[2], quad->vtx1RingCenterQuadOffset[2],
													quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], flLoopGap[2], OutTanU[ accTanCornerU[2] ], OutTanV[ accTanCornerV[2] ] );
												ComputeCatmullClarkLimitTangents( 3, pPos, pOneRing[3], quad->vtx1RingSize[3], quad->vtx1RingCenterQuadOffset[3],
													quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], flLoopGap[3], OutTanU[ accTanCornerU[3] ], OutTanV[ accTanCornerV[3] ] );
											}

											// compute correction component to boundary tangents for tangent plane continuity
											//                             /TanV/ /TanU/ / TanV / /TanU/
											// CB_CornerIdx: three tangent indices along each of the four patch edges (read below).
											// CB_InteriorIdx: the two tangent indices per edge that receive the correction.
											// CB_sign is not referenced in this function.
											static int   CB_CornerIdx[]   = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
											static int   CB_InteriorIdx[] = {1,2,   5,8,    10,9,    6,3 };
											static fltx4 CB_sign[4] = {Four_Ones,Four_NegativeOnes,Four_Ones,Four_NegativeOnes};

											{
												// Unroll, since the compiler wants to keep it rolled, and we get better perf unrolled
												// Each of the four blocks below handles one patch edge between vertices (a, b):
												//   c0 = cos( loopGap[a] / val_a ), c1 = cos( loopGap[b] / val_b ),
												//   where val is the valence, minus one for boundary vertices
												//   E = ( c0 * 2*t1 - c1 * t0 ) / 3,  F = ( c0 * t2 - c1 * 2*t1 ) / 3
												// with t0, t1, t2 the three tangents along that edge.

												// Edge 0-1: reads TanU along the edge, adds the correction to TanV
												{
													fltx4 u00 = OutTanU[CB_CornerIdx[0]];
													fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[1]], Four_Twos );
													fltx4 u20 = OutTanU[CB_CornerIdx[2]];

													int val0 = quad->valences[0];	int val1 = quad->valences[1];
													if ( quad->bndVtx[0] ) val0--;
													if ( quad->bndVtx[1] ) val1--;

													fltx4 c0 = ReplicateX4( cosf( (flLoopGap[0]) / val0 ) );
													fltx4 c1 = ReplicateX4( cosf( (flLoopGap[1]) / val1 ) );

													fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
													fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

													OutTanV[CB_InteriorIdx[0] ] = AddSIMD( OutTanV[CB_InteriorIdx[0] ], E );
													OutTanV[CB_InteriorIdx[1] ] = AddSIMD( OutTanV[CB_InteriorIdx[1] ], F );
												}

												// Edge 1-2: reads TanV along the edge, subtracts the correction from TanU
												{
													fltx4 u00 = OutTanV[CB_CornerIdx[3]];
													fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[4]], Four_Twos );
													fltx4 u20 = OutTanV[CB_CornerIdx[5]];

													int val0 = quad->valences[1];	int val1 = quad->valences[2];
													if ( quad->bndVtx[1] ) val0--;
													if ( quad->bndVtx[2] ) val1--;

													fltx4 c0 = ReplicateX4( cosf( (flLoopGap[1]) / val0 ) );
													fltx4 c1 = ReplicateX4( cosf( (flLoopGap[2]) / val1 ) );

													fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
													fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

													OutTanU[CB_InteriorIdx[2] ] = SubSIMD( OutTanU[CB_InteriorIdx[2] ], E );
													OutTanU[CB_InteriorIdx[3] ] = SubSIMD( OutTanU[CB_InteriorIdx[3] ], F );
												}

												// Edge 2-3: reads TanU along the edge, adds the correction to TanV
												{
													fltx4 u00 = OutTanU[CB_CornerIdx[6]];
													fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[7]], Four_Twos );
													fltx4 u20 = OutTanU[CB_CornerIdx[8]];

													int val0 = quad->valences[2];	int val1 = quad->valences[3];
													if ( quad->bndVtx[2] ) val0--;
													if ( quad->bndVtx[3] ) val1--;

													fltx4 c0 = ReplicateX4( cosf( (flLoopGap[2]) / val0 ) );
													fltx4 c1 = ReplicateX4( cosf( (flLoopGap[3]) / val1 ) );

													fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
													fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

													OutTanV[CB_InteriorIdx[4] ] = AddSIMD( OutTanV[CB_InteriorIdx[4] ], E );
													OutTanV[CB_InteriorIdx[5] ] = AddSIMD( OutTanV[CB_InteriorIdx[5] ], F );
												}

												// Edge 3-0: reads TanV along the edge, subtracts the correction from TanU
												{
													fltx4 u00 = OutTanV[CB_CornerIdx[9]];
													fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[10]], Four_Twos );
													fltx4 u20 = OutTanV[CB_CornerIdx[11]];

													int val0 = quad->valences[3];	int val1 = quad->valences[0];
													if ( quad->bndVtx[3] ) val0--;
													if ( quad->bndVtx[0] ) val1--;

													fltx4 c0 = ReplicateX4( cosf( (flLoopGap[3]) / val0 ) );
													fltx4 c1 = ReplicateX4( cosf( (flLoopGap[0]) / val1 ) );

													fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
													fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

													OutTanU[CB_InteriorIdx[6] ] = SubSIMD( OutTanU[CB_InteriorIdx[6] ], E );
													OutTanU[CB_InteriorIdx[7] ] = SubSIMD( OutTanU[CB_InteriorIdx[7] ], F );
												}
											}
										}

										// Write out the 12 U tangents
										StoreAlignedSIMD( (float*)&TanU[0], OutTanU[0] );
										StoreAlignedSIMD( (float*)&TanU[1], OutTanU[1] );
										StoreAlignedSIMD( (float*)&TanU[2], OutTanU[2] );
										StoreAlignedSIMD( (float*)&TanU[3], OutTanU[3] );
										StoreAlignedSIMD( (float*)&TanU[4], OutTanU[4] );
										StoreAlignedSIMD( (float*)&TanU[5], OutTanU[5] );
										StoreAlignedSIMD( (float*)&TanU[6], OutTanU[6] );
										StoreAlignedSIMD( (float*)&TanU[7], OutTanU[7] );
										StoreAlignedSIMD( (float*)&TanU[8], OutTanU[8] );
										StoreAlignedSIMD( (float*)&TanU[9], OutTanU[9] );
										StoreAlignedSIMD( (float*)&TanU[10], OutTanU[10] );
										StoreAlignedSIMD( (float*)&TanU[11], OutTanU[11] );

										// Write out the 12 V tangents
										StoreAlignedSIMD( (float*)&TanV[0], OutTanV[0] );
										StoreAlignedSIMD( (float*)&TanV[1], OutTanV[1] );
										StoreAlignedSIMD( (float*)&TanV[2], OutTanV[2] );
										StoreAlignedSIMD( (float*)&TanV[3], OutTanV[3] );
										StoreAlignedSIMD( (float*)&TanV[4], OutTanV[4] );
										StoreAlignedSIMD( (float*)&TanV[5], OutTanV[5] );
										StoreAlignedSIMD( (float*)&TanV[6], OutTanV[6] );
										StoreAlignedSIMD( (float*)&TanV[7], OutTanV[7] );
										StoreAlignedSIMD( (float*)&TanV[8], OutTanV[8] );
										StoreAlignedSIMD( (float*)&TanV[9], OutTanV[9] );
										StoreAlignedSIMD( (float*)&TanV[10], OutTanV[10] );
										StoreAlignedSIMD( (float*)&TanV[11], OutTanV[11] );
									}

								#endif

									// Write out the 16 position control points
									StoreAlignedSIMD( (float*)&Pos[0], OutPos[0] );
									StoreAlignedSIMD( (float*)&Pos[1], OutPos[1] );
									StoreAlignedSIMD( (float*)&Pos[2], OutPos[2] );
									StoreAlignedSIMD( (float*)&Pos[3], OutPos[3] );
									StoreAlignedSIMD( (float*)&Pos[4], OutPos[4] );
									StoreAlignedSIMD( (float*)&Pos[5], OutPos[5] );
									StoreAlignedSIMD( (float*)&Pos[6], OutPos[6] );
									StoreAlignedSIMD( (float*)&Pos[7], OutPos[7] );
									StoreAlignedSIMD( (float*)&Pos[8], OutPos[8] );
									StoreAlignedSIMD( (float*)&Pos[9], OutPos[9] );
									StoreAlignedSIMD( (float*)&Pos[10], OutPos[10] );
									StoreAlignedSIMD( (float*)&Pos[11], OutPos[11] );
									StoreAlignedSIMD( (float*)&Pos[12], OutPos[12] );
									StoreAlignedSIMD( (float*)&Pos[13], OutPos[13] );
									StoreAlignedSIMD( (float*)&Pos[14], OutPos[14] );
									StoreAlignedSIMD( (float*)&Pos[15], OutPos[15] );
								}


								#endif