#include "r_studiosubd_patches.h"
#include "tier1/convar.h"
// NOTE(review): the header name of the third #include was missing from the text under
// review. This file uses FLT_EPSILON and sin/cos/atan2/sqrtf, so <float.h> and <math.h>
// are listed here -- confirm against the original include.
#include <float.h>
#include <math.h>

#define PI 3.14159265

#ifdef _DEBUG
// Debug-only lists that the scalar patch code appends every computed corner, edge and
// interior control point to (via AddToTail). The element type is Vector4D because the
// values appended are the Vector4D outputs of those routines.
CUtlVector<Vector4D> g_DebugCornerPositions;
CUtlVector<Vector4D> g_DebugEdgePositions;
CUtlVector<Vector4D> g_DebugInteriorPositions;
#endif

//----------------------------------------------------------------------------------------------
// static stencil buffers
//
// Scalar (non-USE_OPT) weight tables. The first index is a valence (0..MAX_VALENCE), the
// second index is the position of a weight inside that valence's stencil.
//----------------------------------------------------------------------------------------------
#if !defined( USE_OPT )
static float sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
static float sPosEdge1Stencil[MAX_VALENCE+1][6];
static float sPosEdge2Stencil[MAX_VALENCE+1][6];
static float sPosInteriorStencil[MAX_VALENCE+1][4];

static float sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];

static float sPosGregoryInterior1Stencil[6];
static float sPosGregoryInterior2Stencil[6];

static float sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sPosEdge1BndStencil[MAX_VALENCE+1][6];
static float sPosEdge2BndStencil[MAX_VALENCE+1][6];
static float sPosInteriorBndStencil[MAX_VALENCE+1][4];
static float sPosEdge1CornerStencil[MAX_VALENCE+1][6];
static float sPosEdge2CornerStencil[MAX_VALENCE+1][6];
#endif

// File-scope switches. sShowACCGeometryTangents and sUseCornerTangents are read by the
// tangent code below; the other two are not read anywhere in the visible part of the file.
static bool sTableInited = false;
static bool sCornerCorrection = false;
static bool sShowACCGeometryTangents = false;
static bool sUseCornerTangents = true;

// When true, the tangent patches keep the tangents derived from the geometry patch and the
// limit-tangent / continuity-correction pass is skipped.
void set_ShowACCGeometryTangents(bool v)
{
	sShowACCGeometryTangents = v;
}

// Stores the corner-correction switch.
void set_CornerCorrection(bool v)
{
	sCornerCorrection = v;
}

// When false, the limit-tangent routines treat every corner vertex as a non-corner vertex.
void set_UseCornerTangents(bool v)
{
	sUseCornerTangents = v;
}

// averaging function over geometry patch tangents.
static float tangentAveraging( int n, int j) { return sin( PI * j / (float) n ); } //-------------------------------------------------------------------------------------- // Subdiv Stencils //-------------------------------------------------------------------------------------- #if !defined( USE_OPT ) static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, float *stencilBuffer) { VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") ); memset(stencilBuffer, 0, 2*n*sizeof(float)); if (!boundary) { float scale = 1.0f / (n*n + 5.0f*n); stencilBuffer[0] = n*n * scale; for (int i=0; i 0 ) { limitPos = pPos[ oneRing[0] ]; } else { assert( valence <= MAX_VALENCE ); float *pStencil = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ]; // pStencil[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer) limitPos = pStencil[0] * pPos[ oneRing[0] ]; for ( int k = 0; k < vtx1RingSize; k++ ) { int idx = ( k + minOneRingIndex ) % vtx1RingSize; // Shuffle to get the minimum index consistently first in order if ( idx != 0 ) // Don't do pStencil[0] again { limitPos += pStencil[idx] * pPos[ oneRing[idx] ]; } } } #ifdef _DEBUG g_DebugCornerPositions.AddToTail( limitPos ); #endif } inline Vector4D CrossProduct(const Vector4D& a, const Vector4D& b) { return Vector4D( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f ); } inline float VectorNormalize(Vector4D& vec) { float radius = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z); // FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero. 
float iradius = 1.f / ( radius + FLT_EPSILON ); vec.x *= iradius; vec.y *= iradius; vec.z *= iradius; return radius; } FORCEINLINE float DotProduct(const Vector4D& a, const Vector4D& b) { return ( a.x*b.x + a.y*b.y + a.z*b.z ); } inline void ComputeCatmullClarkLimitTangents( int idx, Vector4D *pPos, unsigned short *oneRing, unsigned short vtx1RingSize, unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx, unsigned short valence, unsigned short &loopGapAngle, Vector4D &limitTanU, Vector4D &limitTanV ) { // for valence=1, no need to have separate tangents float tanUSign[] = {1,-1,-1,1}; float tanVSign[] = {1,1,-1,-1}; VPROF_BUDGET( "ComputeCatmullClarkLimitTangents", _T("SubD Rendering") ); if ( !sUseCornerTangents ) cornerVtx = 0; if ( !bndVtx ) // interior vertices { float *stencil1 = sCCLimitTanStencil1[ valence ]; float *stencil2 = sCCLimitTanStencil2[ valence ]; limitTanU = Vector4D(0,0,0,0); limitTanV = Vector4D(0,0,0,0); for (int k = 0; k < vtx1RingSize; ++k) { limitTanU += stencil1[k] * pPos[ oneRing[k] ]; limitTanV += stencil2[k] * pPos[ oneRing[k] ]; } } else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) ) // smooth boundary vertices { float *stencil1 = sCCLimitTanBndStencil1[ valence ]; float *stencil2 = sCCLimitTanBndStencil2[ valence ]; Vector4D r0 = Vector4D(0,0,0,0); Vector4D r1 = Vector4D(0,0,0,0); for (int k = 0; k < vtx1RingSize; ++k) { r0 += stencil1[k] * pPos[ oneRing[k] ]; r1 += stencil2[k] * pPos[ oneRing[k] ]; } int j1 = (centerOffset - 1) / 2; int j2 = j1+1; int K = (valence - 1); if (valence == 2) { limitTanU = r0; limitTanV = r1; } else { limitTanU = cos(PI*j1 / K) * r0 + sin(PI*j1 / K) * r1; limitTanV = cos(PI*j2 / K) * r0 + sin(PI*j2 / K) * r1; } } else // corner vertices { if ( valence == 2 ) return; float *pEdgeStencil = sPosEdge1Stencil[ valence ]; // float *avgStencil = sCCLimitTanCornerStencil1[ valence ]; // compute tangents Vector4D c0 = pPos[ oneRing[1] ] - pPos[ oneRing[0] ]; c0.w = 0; 
Vector4D c1 = pPos[ oneRing[vtx1RingSize - 1] ] - pPos[ oneRing[0] ]; c1.w = 0; Vector4D e0 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ]; Vector4D e1 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ]; for (int k = 1; k < 6; k++ ) { e0 += pEdgeStencil[k] * pPos[ oneRing[ k ] ]; e1 += pEdgeStencil[k] * pPos[ oneRing[ vtx1RingSize - 6 + k ] ]; } e0.w = 0; e1.w = 0; // compute average tangent plane normal Vector4D n0 = CrossProduct( c0, e0 ); VectorNormalize( n0 ); Vector4D n1 = CrossProduct( e1, c1 ); VectorNormalize( n1 ); Vector4D N = n0 + n1; // N = N - ( DotProduct( N, tAvg )/ DotProduct(tAvg, tAvg) ) * tAvg; VectorNormalize( N ); // project into tangent plane c0 = c0 - DotProduct(c0, N) * N; c1 = c1 - DotProduct(c1, N) * N; float c0l = Vector4DLength( c0 ); c0 = c0 / c0l; float c1l = Vector4DLength( c1 ); c1 = c1 / c1l; float cAvg = (c0l + c1l) / 2; // compute angle Vector4D c0p = CrossProduct(N, c0); float angle = PI - atan2( DotProduct(c0p, c1), -DotProduct(c0, c1) ); loopGapAngle = (unsigned int) ((65535.0 * angle) / (2*PI)); // compute final tangent vector int j1 = (centerOffset - 1) / 2; int j2 = j1+1; int K = (valence - 1); limitTanU = cAvg * ( cos(angle*j1 / K) * c0 + sin(angle*j1 / K) * c0p ); limitTanV = cAvg * ( cos(angle*j2 / K) * c0 + sin(angle*j2 / K) * c0p ); } // flip tangents so they point in u/v direction if ( idx & 1 ) { swap(limitTanU, limitTanV); } limitTanU *= tanUSign[idx]; limitTanV *= tanVSign[idx]; } inline void ComputeACCEdgePositions( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1, unsigned short cornerVtx0, unsigned short cornerVtx1, unsigned short loopGapAngle0, unsigned short loopGapAngle1, unsigned short edgeBias0, unsigned short edgeBias1, unsigned short val0, unsigned short val1, unsigned short minOneRingOffset, unsigned short vtx1RingSize, Vector4D &edgePos0, Vector4D &edgePos1) { VPROF_BUDGET( "ComputeACCEdgePositions", _T("SubD 
Rendering") ); if ( bndVtx0 ) { val0 = 2*(val0 - 1); } if ( bndVtx1 ) { val1 = 2*(val1 - 1); } Assert( val0 <= MAX_VALENCE ); Assert( val1 <= MAX_VALENCE ); float* pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ]; float* pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ]; int kEnd = (bndEdge) ? 4 : 6; if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) ) { int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 }; for ( int i = 1; i < kEnd; i++ ) { oneRingIndex[i] = centerOffset + i - 1; } edgePos0 = edgePos1 = Vector4D(0,0,0,0); for ( int k = 0; k < kEnd; k++ ) { int idx = ( k + minOneRingOffset ) % kEnd; // Offset to min index to enforce evaluation order between neighboring patches edgePos0 += pStencil0[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ]; edgePos1 += pStencil1[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ]; } } else { float b0, b1; b1 = edgeBias0 / 32768.0, b0 = 1.0f-b1; edgePos0 = (val0 * pPos[ oneRing[0] ] + 2*b0*pPos[ oneRing[centerOffset + 0] ] + 1*b0*pPos[ oneRing[centerOffset + 1] ] + 2*pPos[ oneRing[centerOffset + 2] ] + 1*b1*pPos[ oneRing[centerOffset + 3] ] + 2*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val0 + 5.0f); b1 = edgeBias1 / 32768.0, b0 = 1.0f-b1; edgePos1 = ( 2 * pPos[ oneRing[0] ] + 1*b0*pPos[ oneRing[centerOffset + 0] ] + 2*b0*pPos[ oneRing[centerOffset + 1] ] + val1*pPos[ oneRing[centerOffset + 2] ] + 2*b1*pPos[ oneRing[centerOffset + 3] ] + 1*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val1 + 5.0f); } #ifdef _DEBUG g_DebugEdgePositions.AddToTail( edgePos0 ); g_DebugEdgePositions.AddToTail( edgePos1 ); #endif } inline void ComputeACCInteriorPosition( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, Vector4D &interiorPos ) { VPROF_BUDGET( "ComputeACCInteriorPosition", _T("SubD Rendering") ); if ( bndVtx ) { valence = valence>2 ? 
2*(valence - 1) : 4*(valence - 1); } Assert( valence<=MAX_VALENCE ); float *stencil = sPosInteriorStencil[ valence ]; interiorPos = stencil[0] * pPos[ oneRing[0] ]; for ( int k = 1; k < 4; ++k ) { interiorPos += stencil[k] * pPos[ oneRing[ centerOffset + k - 1 ] ]; } #ifdef _DEBUG g_DebugInteriorPositions.AddToTail( interiorPos ); #endif } inline void ComputeACCGeometryPatchTangents( Vector4D *Pos, Vector4D *TanU, Vector4D *TanV ) { VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") ); for ( int j=0; j<3; j++ ) { for ( int i=0; i<4; i++ ) { TanU[i*3+j] = 3*( Pos[i*4+j+1] - Pos[i*4+j] ); TanV[j*4+i] = 3*( Pos[(j+1)*4+i] - Pos[j*4+i] ); } } } void ComputeACCGeometryPatch( Vector4D* pPos, TopologyIndexStruct *quad, Vector4D* Pos) { VPROF_BUDGET( "ComputeACCGeometryPatch", _T("SubD Rendering") ); int MOD4[8] = {0,1,2,3,0,1,2,3}; int accCorner[] = {0,3,15,12}; int accEdge1[] = {4,2,11,13}; int accEdge2[] = {8,1,7,14}; int accInterior[] = {5,6,10,9}; int vtx1RingStart = 0; unsigned short *oneRing = quad->oneRing; for ( int i=0; i<4; i++ ) // 4 corner vertices { ComputeCatmullClarkLimitPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingSize[i], quad->minOneRingOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->nbCornerVtx[i], Pos[ accCorner[i] ] ); ComputeACCEdgePositions( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i], quad->bndEdge[ MOD4[i+3] ], quad->bndVtx[i], quad->bndVtx[MOD4[i+3]], quad->cornerVtx[i], quad->cornerVtx[MOD4[i+3]], quad->loopGapAngle[i], quad->loopGapAngle[MOD4[i+3]], quad->edgeBias[ 2*MOD4[i+3] ], quad->edgeBias[ 2*MOD4[i+3] + 1 ], quad->valences[i], quad->valences[MOD4[i+3]], quad->minOneRingOffset[i], quad->vtx1RingSize[i], Pos[accEdge1[i]], Pos[accEdge2[i]] ); ComputeACCInteriorPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->loopGapAngle[i], quad->valences[i], Pos[ accInterior[i] ] ); vtx1RingStart += 
quad->vtx1RingSize[i]; } } void ComputeACCTangentPatches( Vector4D* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV ) { VPROF_BUDGET( "ComputeACCTangentPatches", _T("SubD Rendering") ); int MOD4[8] = {0,1,2,3,0,1,2,3}; int accTanCornerU[] = {0,2,11,9}; // counterclockwise orders! int accTanCornerV[] = {0,3,11,8}; unsigned short *oneRing = quad->oneRing; ComputeACCGeometryPatchTangents(Pos, TanU, TanV); #if !defined( NO_TANGENTS ) if ( !sShowACCGeometryTangents ) { // compute corner tangents ( = subdivision surface limit tangents) int vtx1RingStart = 0; for ( int i=0; i<4; i++ ) { int vtx1RingSize = quad->vtx1RingSize[i]; Vector4D &accTanU = TanU[ accTanCornerU[i] ]; Vector4D &accTanV = TanV[ accTanCornerV[i] ]; ComputeCatmullClarkLimitTangents(i, pPos, &oneRing[vtx1RingStart], vtx1RingSize, quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->loopGapAngle[i], accTanU, accTanV ); vtx1RingStart += vtx1RingSize; } // compute correction component to boundary tangents for tangent plane continuity // /TanV/ /TanU/ / TanV / /TanU/ static int CB_CornerIdx[] = {0,1,2, 3,7,11, 11,10,9, 8,4,0 }; static int CB_InteriorIdx[] = {1,2, 5,8, 10,9, 6,3 }; static float CB_sign[] = {1,-1,1,-1}; for ( int i=0; i<4; i++ ) // for all quad edges { if ( !quad->bndEdge[i] ) { Vector4D *CBTanV = (i&1) ? TanU : TanV; Vector4D *CBTanU = (i&1) ? 
TanV : TanU; Vector4D u00 = CBTanU[CB_CornerIdx[3*i + 0]]; Vector4D u10 = CBTanU[CB_CornerIdx[3*i + 1]]; Vector4D u20 = CBTanU[CB_CornerIdx[3*i + 2]]; int val0 = quad->valences[i]; int val1 = quad->valences[MOD4[i+1]]; if ( quad->bndVtx[i] ) val0--; if ( quad->bndVtx[MOD4[i+1]] ) val1--; float c0 = cos( (2*PI * quad->loopGapAngle[ i ] / 65535.0f) / val0 ); float c1 = cos( (2*PI * quad->loopGapAngle[MOD4[i+1]] / 65535.0f) / val1 ); CBTanV[ CB_InteriorIdx[2*i + 0] ] += CB_sign[i]*( 2*c0*u10 - c1*u00 )/3.0f; CBTanV[ CB_InteriorIdx[2*i + 1] ] += CB_sign[i]*( c0*u20 - 2*c1*u10 )/3.0f; } } } #endif } #endif // !defined( USE_OPT ) #if defined( USE_OPT ) #define M_PI2 6.28318530717958647692f static fltx4 Four_NegativeThirds; static fltx4 Four_Fives; static fltx4 Four_Tens; static fltx4 Four_N[32]; static fltx4 Four_TwoPI; static fltx4 Four_Valence[MAX_VALENCE]; static fltx4 Four_ValencePlus5[MAX_VALENCE]; static fltx4 sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2]; static fltx4 sPosEdge1Stencil[MAX_VALENCE+1][6]; static fltx4 sPosEdge2Stencil[MAX_VALENCE+1][6]; static fltx4 sPosInteriorStencil[MAX_VALENCE+1][4]; static fltx4 sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1]; static fltx4 sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1]; static fltx4 sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1]; static fltx4 sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1]; static fltx4 sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1]; static fltx4 sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1]; static fltx4 sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1]; static fltx4 sPosEdge1BndStencil[MAX_VALENCE+1][6]; static fltx4 sPosEdge2BndStencil[MAX_VALENCE+1][6]; static fltx4 sPosInteriorBndStencil[MAX_VALENCE+1][4]; static fltx4 sPosEdge1CornerStencil[MAX_VALENCE+1][6]; static fltx4 sPosEdge2CornerStencil[MAX_VALENCE+1][6]; static fltx4 sCCSinPI[MAX_VALENCE*2][MAX_VALENCE]; static fltx4 
sCCCosPI[MAX_VALENCE*2][MAX_VALENCE]; static float Valence_MinusOne[MAX_VALENCE]; static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, fltx4 *stencilBuffer) { VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") ); for ( int i=0; i<2*n; ++i ) { stencilBuffer[i] = Four_Zeros; } if ( !boundary ) { float scale = 1.0f / (n*n + 5.0f*n); stencilBuffer[0] = ReplicateX4( n*n * scale ); for ( int i=0; i 0 ) { limitPos = pPos[ pOneRing[0] ]; } else { assert( valence <= MAX_VALENCE ); fltx4 *pStencil = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ]; // pStencil[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer) limitPos = MulSIMD( pStencil[0], pPos[ pOneRing[0] ] ); for ( int k = 0; k < vtx1RingSize; k++ ) { int idx = ( k + minOneRingIndex ) % vtx1RingSize; // Shuffle to get the minimum index consistently first in order if ( idx != 0 ) // Don't do pStencil[0] again { limitPos = MaddSIMD( pStencil[idx], pPos[ pOneRing[idx] ], limitPos ); } } } } FORCEINLINE fltx4 VectorNormalize( fltx4 &A ) { fltx4 mag_sq = Dot3SIMD( A, A ); // length^2 fltx4 invSqrt = ReciprocalSqrtEstSIMD(mag_sq); return MulSIMD( A, invSqrt ); } FORCEINLINE fltx4 VectorLength( fltx4 &A ) { fltx4 mag_sq = Dot3SIMD( A, A ); // length^2 fltx4 invSqrt = ReciprocalSqrtEstSIMD(mag_sq); return invSqrt; } FORCEINLINE fltx4 CrossProduct( const fltx4 &A, const fltx4 &B ) { #if defined( _X360 ) return XMVector3Cross( A, B ); #elif defined( _WIN32 ) fltx4 A1 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 1, 2, 0, 3 ) ); fltx4 B1 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 2, 0, 1, 3 ) ); fltx4 Result1 = MulSIMD( A1, B1 ); fltx4 A2 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 2, 0, 1, 3 ) ); fltx4 B2 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 1, 2, 0, 3 ) ); fltx4 Result2 = MulSIMD( A2, B2 ); return SubSIMD( Result1, Result2 ); #else fltx4 CrossVal; SubFloat( CrossVal, 0 ) = SubFloat( A, 1 )*SubFloat( B, 2 ) - SubFloat( A, 2 )*SubFloat( B, 1 ); SubFloat( 
CrossVal, 1 ) = SubFloat( A, 2 )*SubFloat( B, 0 ) - SubFloat( A, 0 )*SubFloat( B, 2 ); SubFloat( CrossVal, 2 ) = SubFloat( A, 0 )*SubFloat( B, 1 ) - SubFloat( A, 1 )*SubFloat( B, 0 ); SubFloat( CrossVal, 3 ) = 0; return CrossVal; #endif } FORCEINLINE void ComputeCatmullClarkLimitTangents( int idx, fltx4 *pPos, unsigned short *pOneRing, unsigned short vtx1RingSize, unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx, unsigned short valence, float &loopGapAngle, fltx4 &limitTanU, fltx4 &limitTanV ) { VPROF_BUDGET( "ComputeCatmullClarkLimitTangents (SIMD)", _T( "SubD Rendering" ) ); // for valence=1, no need to have separate tangents static const fltx4 tanUSign[4] = { Four_Ones, Four_NegativeOnes, Four_NegativeOnes, Four_Ones }; static const fltx4 tanVSign[4] = { Four_Ones, Four_Ones, Four_NegativeOnes, Four_NegativeOnes }; if (!sUseCornerTangents) cornerVtx = 0; // interior vertices if ( !bndVtx ) { fltx4 *pStencil0 = sCCLimitTanStencil1[ valence ]; fltx4 *pStencil1 = sCCLimitTanStencil2[ valence ]; limitTanU = limitTanV = Four_Zeros; for ( int k = 0; k < vtx1RingSize; k++ ) { limitTanU = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], limitTanU ); limitTanV = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], limitTanV ); } } else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) ) { // smooth boundary vertices fltx4 *pStencil0 = sCCLimitTanBndStencil1[ valence ]; fltx4 *pStencil1 = sCCLimitTanBndStencil2[ valence ]; fltx4 r0 = Four_Zeros; fltx4 r1 = Four_Zeros; for (int k = 0; k < vtx1RingSize; ++k) { r0 = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], r0 ); r1 = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], r1 ); } int j1 = ( centerOffset - 1 ) / 2; int j2 = j1 + 1; int k = valence - 1; if ( valence == 2 ) { limitTanU = r0; limitTanV = r1; } else { limitTanU = AddSIMD( MulSIMD( sCCCosPI[j1][k], r0 ), MulSIMD( sCCSinPI[j1][k], r1 ) ); limitTanV = AddSIMD( MulSIMD( sCCCosPI[j2][k], r0 ), MulSIMD( sCCSinPI[j2][k], r1 ) ); } } else 
{ // Corner vertices if ( valence == 2 ) return; fltx4 *pEdgeStencil = sPosEdge1Stencil[ valence ]; // Compute tangents fltx4 c0 = SubSIMD( pPos[ pOneRing[ 1 ] ], pPos[ pOneRing[ 0 ] ] ); fltx4 c1 = SubSIMD( pPos[ pOneRing[ vtx1RingSize - 1 ] ], pPos[ pOneRing[ 0 ] ] ); fltx4 e0 = MulSIMD( SubSIMD( pEdgeStencil[0], Four_Ones ), pPos[ pOneRing[ 0 ] ] ); fltx4 e1 = e0; for ( int k = 1; k < 6; k++ ) { e0 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ k ] ], e0 ); e1 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ vtx1RingSize - 6 + k ] ], e1 ); } // Compute average tangent plane normal fltx4 n0 = CrossProduct( c0, e0 ); n0 = VectorNormalize( n0 ); fltx4 n1 = CrossProduct( e1, c1 ); n1 = VectorNormalize( n1 ); fltx4 N = AddSIMD( n0, n1 ); N = VectorNormalize( N ); // Project into tangent plane fltx4 DotC0N = Dot3SIMD( c0, N ); fltx4 DotC1N = Dot3SIMD( c1, N ); c0 = SubSIMD( c0, MulSIMD( DotC0N, N ) ); c1 = SubSIMD( c1, MulSIMD( DotC1N, N ) ); fltx4 c0l = VectorLength( c0 ); c0 = DivSIMD( c0, c0l ); fltx4 c1l = VectorLength( c1 ); c1 = DivSIMD( c1, c1l ); fltx4 cAvg = MulSIMD( AddSIMD(c0l,c1l), Four_PointFives ); // Compute angle fltx4 c0p = CrossProduct(N, c0); fltx4 dot1 = Dot3SIMD(c0p, c1); fltx4 dot2 = Dot3SIMD(c0, c1); float angle = PI - atan2( SubFloat( dot1, 0 ), -SubFloat( dot2, 0 ) ); loopGapAngle = angle; // Compute final tangent vector int j1 = ( centerOffset - 1 ) / 2; int j2 = j1 + 1; int K = (valence - 1); static float fK[MAX_VALENCE] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f }; // Compute final tangent vector float flK = fK[K]; fltx4 Cos0 = ReplicateX4( cos( angle*j1 / flK ) ); fltx4 Sin0 = ReplicateX4( sin( angle*j1 / flK ) ); fltx4 Cos1 = ReplicateX4( cos( angle*j2 / flK ) ); fltx4 Sin1 = ReplicateX4( sin( angle*j2 / flK ) ); limitTanU = cAvg * ( Cos0 * c0 + Sin0 * c0p ); limitTanV = cAvg * ( Cos1 * c0 + Sin1 * c0p ); } // Flip tangents so they point in u/v direction if ( idx 
& 1 ) { V_swap( limitTanU, limitTanV ); } limitTanU = MulSIMD( limitTanU, tanUSign[idx] ); limitTanV = MulSIMD( limitTanV, tanVSign[idx] ); } FORCEINLINE void ComputeACCEdgePositions( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1, unsigned short cornerVtx0, unsigned short cornerVtx1, unsigned short edgeBias0, unsigned short edgeBias1, unsigned short val0, unsigned short val1, unsigned short minOneRingOffset, unsigned short vtx1RingSize, fltx4 &edgePos0, fltx4 &edgePos1) { VPROF_BUDGET( "ComputeACCEdgePositions (SIMD)", _T("SubD Rendering") ); if ( bndVtx0 ) { val0 = 2*(val0 - 1); } if ( bndVtx1 ) { val1 = 2*(val1 - 1); } Assert( val0 <= MAX_VALENCE ); Assert( val1 <= MAX_VALENCE ); fltx4 *pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ]; fltx4 *pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ]; int kEnd = (bndEdge) ? 
4 : 6; if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) ) { int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 }; for ( int i = 1; i < kEnd; i++ ) { oneRingIndex[i] = centerOffset + i - 1; } edgePos0 = edgePos1 = Four_Zeros; for ( int k = 0; k < kEnd; k++ ) { int idx = ( k + minOneRingOffset ) % kEnd; // Offset to min index to enforce evaluation order between neighboring patches edgePos0 = MaddSIMD( pStencil0[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos0 ); edgePos1 = MaddSIMD( pStencil1[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos1 ); } } else { fltx4 b0, b1; b1 = ReplicateX4( edgeBias0 / 32768.0f ); b0 = SubSIMD( Four_Ones, b1 ); edgePos0 = DivSIMD( ( Four_Valence[val0]*pPos[ oneRing[0] ] + Four_Twos*b0*pPos[ oneRing[ centerOffset] ] + b0*pPos[ oneRing[centerOffset + 1] ] + Four_Twos*pPos[ oneRing[centerOffset + 2] ] + b1*pPos[ oneRing[centerOffset + 3] ] + Four_Twos*b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] ); b1 = ReplicateX4( edgeBias1 / 32768.0f ); b0 = SubSIMD( Four_Ones, b1 ); edgePos1 = DivSIMD( ( Four_Twos*pPos[ oneRing[0] ] + b0*pPos[ oneRing[centerOffset + 0] ] + Four_Twos*b0*pPos[ oneRing[centerOffset + 1] ] + Four_Valence[val1]*pPos[ oneRing[centerOffset + 2] ] + Four_Twos*b1*pPos[ oneRing[centerOffset + 3] ] + b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] ); } } FORCEINLINE void ComputeACCInteriorPosition( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, fltx4 &interiorPos ) { VPROF_BUDGET( "ComputeACCInteriorPosition (SIMD)", _T( "SubD Rendering" ) ); if ( bndVtx ) { valence = valence > 2 ? 
2 * (valence - 1) : 4 * (valence - 1); } Assert( valence <= MAX_VALENCE ); fltx4 *pStencil = sPosInteriorStencil[ valence ]; interiorPos = MulSIMD( pStencil[0], pPos[ oneRing[0] ] ); for ( int k = 1; k < 4; k++ ) { interiorPos = MaddSIMD( pStencil[k], pPos[ oneRing[ centerOffset + k - 1 ] ], interiorPos ); } } FORCEINLINE void ComputeACCGeometryPatchTangents( fltx4 *Pos, fltx4 *TanU, fltx4 *TanV ) { //VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") ); TanU[0] = MulSIMD( Four_Threes, SubSIMD( Pos[1], Pos[0] ) ); TanV[0] = MulSIMD( Four_Threes, SubSIMD( Pos[4], Pos[0] ) ); TanU[3] = MulSIMD( Four_Threes, SubSIMD( Pos[5], Pos[4] ) ); TanV[1] = MulSIMD( Four_Threes, SubSIMD( Pos[5], Pos[1] ) ); TanU[6] = MulSIMD( Four_Threes, SubSIMD( Pos[9], Pos[8] ) ); TanV[2] = MulSIMD( Four_Threes, SubSIMD( Pos[6], Pos[2] ) ); TanU[9] = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[12] ) ); TanV[3] = MulSIMD( Four_Threes, SubSIMD( Pos[7], Pos[3] ) ); TanU[1] = MulSIMD( Four_Threes, SubSIMD( Pos[2], Pos[1] ) ); TanV[4] = MulSIMD( Four_Threes, SubSIMD( Pos[8], Pos[4] ) ); TanU[4] = MulSIMD( Four_Threes, SubSIMD( Pos[6], Pos[5] ) ); TanV[5] = MulSIMD( Four_Threes, SubSIMD( Pos[9], Pos[5] ) ); TanU[7] = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[9] ) ); TanV[6] = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[6] ) ); TanU[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[13] ) ); TanV[7] = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[7] ) ); TanU[2] = MulSIMD( Four_Threes, SubSIMD( Pos[3], Pos[2] ) ); TanV[8] = MulSIMD( Four_Threes, SubSIMD( Pos[12], Pos[8] ) ); TanU[5] = MulSIMD( Four_Threes, SubSIMD( Pos[7], Pos[6] ) ); TanV[9] = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[9] ) ); TanU[8] = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[10] ) ); TanV[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[10] ) ); TanU[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[14] ) ); TanV[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[11] ) ); } void ComputeACCAllPatches( fltx4* 
pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV, bool bRegularPatch ) { VPROF_BUDGET( "ComputeACCAllPatches (SIMD)", _T( "SubD Rendering" ) ); int accCorner[] = { 0, 3, 15, 12 }; int accEdge1[] = { 4, 2, 11, 13 }; int accEdge2[] = { 8, 1, 7, 14 }; int accInterior[] = { 5, 6, 10, 9 }; int accTanCornerU[] = { 0, 2, 11, 9 }; // counterclockwise orders! int accTanCornerV[] = { 0, 3, 11, 8 }; fltx4 OutPos[16], OutTanU[16], OutTanV[16]; // Point to four one-rings int vtx1RingStart = 0; unsigned short* pOneRing[4]; for ( int i = 0; i < 4; i++ ) { unsigned short vtx1RingSize = quad->vtx1RingSize[i]; pOneRing[i] = &(quad->oneRing[vtx1RingStart]); vtx1RingStart += vtx1RingSize; } { VPROF_BUDGET( "ComputeACCAllPatches - Geometry Control Points (SIMD)", _T( "SubD Rendering" ) ); ComputeCatmullClarkLimitPosition( pPos, pOneRing[0], quad->vtx1RingSize[0], quad->minOneRingOffset[0], quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], OutPos[ accCorner[0] ] ); ComputeCatmullClarkLimitPosition( pPos, pOneRing[1], quad->vtx1RingSize[1], quad->minOneRingOffset[1], quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], OutPos[ accCorner[1] ] ); ComputeCatmullClarkLimitPosition( pPos, pOneRing[2], quad->vtx1RingSize[2], quad->minOneRingOffset[2], quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], OutPos[ accCorner[2] ] ); ComputeCatmullClarkLimitPosition( pPos, pOneRing[3], quad->vtx1RingSize[3], quad->minOneRingOffset[3], quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], OutPos[ accCorner[3] ] ); ComputeACCEdgePositions( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0], quad->bndEdge[3], quad->bndVtx[0], quad->bndVtx[3], quad->cornerVtx[0], quad->cornerVtx[3], quad->edgeBias[6], quad->edgeBias[7], quad->valences[0], quad->valences[3], quad->minOneRingOffset[0], quad->vtx1RingSize[0], OutPos[accEdge1[0]], OutPos[accEdge2[0]] ); ComputeACCEdgePositions( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1], quad->bndEdge[0], 
quad->bndVtx[1], quad->bndVtx[0], quad->cornerVtx[1], quad->cornerVtx[0], quad->edgeBias[0], quad->edgeBias[1], quad->valences[1], quad->valences[0], quad->minOneRingOffset[1], quad->vtx1RingSize[1], OutPos[accEdge1[1]], OutPos[accEdge2[1]] ); ComputeACCEdgePositions( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2], quad->bndEdge[1], quad->bndVtx[2], quad->bndVtx[1], quad->cornerVtx[2], quad->cornerVtx[1], quad->edgeBias[2], quad->edgeBias[3], quad->valences[2], quad->valences[1], quad->minOneRingOffset[2], quad->vtx1RingSize[2], OutPos[accEdge1[2]], OutPos[accEdge2[2]] ); ComputeACCEdgePositions( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3], quad->bndEdge[2], quad->bndVtx[3], quad->bndVtx[2], quad->cornerVtx[3], quad->cornerVtx[2], quad->edgeBias[4], quad->edgeBias[5], quad->valences[3], quad->valences[2], quad->minOneRingOffset[3], quad->vtx1RingSize[3], OutPos[accEdge1[3]], OutPos[accEdge2[3]] ); ComputeACCInteriorPosition( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0], quad->bndVtx[0], quad->valences[0], OutPos[ accInterior[0] ] ); ComputeACCInteriorPosition( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1], quad->bndVtx[1], quad->valences[1], OutPos[ accInterior[1] ] ); ComputeACCInteriorPosition( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2], quad->bndVtx[2], quad->valences[2], OutPos[ accInterior[2] ] ); ComputeACCInteriorPosition( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3], quad->bndVtx[3], quad->valences[3], OutPos[ accInterior[3] ] ); } #if !defined( NO_TANGENTS ) // Don't compute tangents for regular patches #if defined( SEPARATE_REGULAR_AND_EXTRA ) if ( !bRegularPatch ) #endif { VPROF_BUDGET( "ComputeACCAllPatches - Tangents (SIMD)", _T( "SubD Rendering" ) ); ComputeACCGeometryPatchTangents( OutPos, OutTanU, OutTanV ); float flLoopGap[4]; flLoopGap[0] = ( M_PI2 * quad->loopGapAngle[0] ) / 65535.0f; flLoopGap[1] = ( M_PI2 * quad->loopGapAngle[1] ) / 65535.0f; flLoopGap[2] = ( M_PI2 * quad->loopGapAngle[2] ) / 
65535.0f; flLoopGap[3] = ( M_PI2 * quad->loopGapAngle[3] ) / 65535.0f; if ( !sShowACCGeometryTangents ) { { ComputeCatmullClarkLimitTangents( 0, pPos, pOneRing[0], quad->vtx1RingSize[0], quad->vtx1RingCenterQuadOffset[0], quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], flLoopGap[0], OutTanU[ accTanCornerU[0] ], OutTanV[ accTanCornerV[0] ] ); ComputeCatmullClarkLimitTangents( 1, pPos, pOneRing[1], quad->vtx1RingSize[1], quad->vtx1RingCenterQuadOffset[1], quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], flLoopGap[1], OutTanU[ accTanCornerU[1] ], OutTanV[ accTanCornerV[1] ] ); ComputeCatmullClarkLimitTangents( 2, pPos, pOneRing[2], quad->vtx1RingSize[2], quad->vtx1RingCenterQuadOffset[2], quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], flLoopGap[2], OutTanU[ accTanCornerU[2] ], OutTanV[ accTanCornerV[2] ] ); ComputeCatmullClarkLimitTangents( 3, pPos, pOneRing[3], quad->vtx1RingSize[3], quad->vtx1RingCenterQuadOffset[3], quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], flLoopGap[3], OutTanU[ accTanCornerU[3] ], OutTanV[ accTanCornerV[3] ] ); } // compute correction component to boundary tangents for tangent plane continuity // /TanV/ /TanU/ / TanV / /TanU/ static int CB_CornerIdx[] = {0,1,2, 3,7,11, 11,10,9, 8,4,0 }; static int CB_InteriorIdx[] = {1,2, 5,8, 10,9, 6,3 }; static fltx4 CB_sign[4] = {Four_Ones,Four_NegativeOnes,Four_Ones,Four_NegativeOnes}; { // Unroll, since the compiler wants to keep it rolled, and we get better perf unrolled { fltx4 u00 = OutTanU[CB_CornerIdx[0]]; fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[1]], Four_Twos ); fltx4 u20 = OutTanU[CB_CornerIdx[2]]; int val0 = quad->valences[0]; int val1 = quad->valences[1]; if ( quad->bndVtx[0] ) val0--; if ( quad->bndVtx[1] ) val1--; fltx4 c0 = ReplicateX4( cosf( (flLoopGap[0]) / val0 ) ); fltx4 c1 = ReplicateX4( cosf( (flLoopGap[1]) / val1 ) ); fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 ); fltx4 E = 
DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes ); OutTanV[CB_InteriorIdx[0] ] = AddSIMD( OutTanV[CB_InteriorIdx[0] ], E ); OutTanV[CB_InteriorIdx[1] ] = AddSIMD( OutTanV[CB_InteriorIdx[1] ], F ); } { fltx4 u00 = OutTanV[CB_CornerIdx[3]]; fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[4]], Four_Twos ); fltx4 u20 = OutTanV[CB_CornerIdx[5]]; int val0 = quad->valences[1]; int val1 = quad->valences[2]; if ( quad->bndVtx[1] ) val0--; if ( quad->bndVtx[2] ) val1--; fltx4 c0 = ReplicateX4( cosf( (flLoopGap[1]) / val0 ) ); fltx4 c1 = ReplicateX4( cosf( (flLoopGap[2]) / val1 ) ); fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 ); fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes ); OutTanU[CB_InteriorIdx[2] ] = SubSIMD( OutTanU[CB_InteriorIdx[2] ], E ); OutTanU[CB_InteriorIdx[3] ] = SubSIMD( OutTanU[CB_InteriorIdx[3] ], F ); } { fltx4 u00 = OutTanU[CB_CornerIdx[6]]; fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[7]], Four_Twos ); fltx4 u20 = OutTanU[CB_CornerIdx[8]]; int val0 = quad->valences[2]; int val1 = quad->valences[3]; if ( quad->bndVtx[2] ) val0--; if ( quad->bndVtx[3] ) val1--; fltx4 c0 = ReplicateX4( cosf( (flLoopGap[2]) / val0 ) ); fltx4 c1 = ReplicateX4( cosf( (flLoopGap[3]) / val1 ) ); fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 ); fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes ); OutTanV[CB_InteriorIdx[4] ] = AddSIMD( OutTanV[CB_InteriorIdx[4] ], E ); OutTanV[CB_InteriorIdx[5] ] = AddSIMD( OutTanV[CB_InteriorIdx[5] ], F ); } { fltx4 u00 = OutTanV[CB_CornerIdx[9]]; fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[10]], Four_Twos ); fltx4 u20 = OutTanV[CB_CornerIdx[11]]; int val0 = quad->valences[3]; int val1 = quad->valences[0]; if ( quad->bndVtx[3] ) val0--; if ( quad->bndVtx[0] ) val1--; fltx4 c0 
= ReplicateX4( cosf( (flLoopGap[3]) / val0 ) ); fltx4 c1 = ReplicateX4( cosf( (flLoopGap[0]) / val1 ) ); fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 ); fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes ); OutTanU[CB_InteriorIdx[6] ] = SubSIMD( OutTanU[CB_InteriorIdx[6] ], E ); OutTanU[CB_InteriorIdx[7] ] = SubSIMD( OutTanU[CB_InteriorIdx[7] ], F ); } } } StoreAlignedSIMD( (float*)&TanU[0], OutTanU[0] ); StoreAlignedSIMD( (float*)&TanU[1], OutTanU[1] ); StoreAlignedSIMD( (float*)&TanU[2], OutTanU[2] ); StoreAlignedSIMD( (float*)&TanU[3], OutTanU[3] ); StoreAlignedSIMD( (float*)&TanU[4], OutTanU[4] ); StoreAlignedSIMD( (float*)&TanU[5], OutTanU[5] ); StoreAlignedSIMD( (float*)&TanU[6], OutTanU[6] ); StoreAlignedSIMD( (float*)&TanU[7], OutTanU[7] ); StoreAlignedSIMD( (float*)&TanU[8], OutTanU[8] ); StoreAlignedSIMD( (float*)&TanU[9], OutTanU[9] ); StoreAlignedSIMD( (float*)&TanU[10], OutTanU[10] ); StoreAlignedSIMD( (float*)&TanU[11], OutTanU[11] ); StoreAlignedSIMD( (float*)&TanV[0], OutTanV[0] ); StoreAlignedSIMD( (float*)&TanV[1], OutTanV[1] ); StoreAlignedSIMD( (float*)&TanV[2], OutTanV[2] ); StoreAlignedSIMD( (float*)&TanV[3], OutTanV[3] ); StoreAlignedSIMD( (float*)&TanV[4], OutTanV[4] ); StoreAlignedSIMD( (float*)&TanV[5], OutTanV[5] ); StoreAlignedSIMD( (float*)&TanV[6], OutTanV[6] ); StoreAlignedSIMD( (float*)&TanV[7], OutTanV[7] ); StoreAlignedSIMD( (float*)&TanV[8], OutTanV[8] ); StoreAlignedSIMD( (float*)&TanV[9], OutTanV[9] ); StoreAlignedSIMD( (float*)&TanV[10], OutTanV[10] ); StoreAlignedSIMD( (float*)&TanV[11], OutTanV[11] ); } #endif StoreAlignedSIMD( (float*)&Pos[0], OutPos[0] ); StoreAlignedSIMD( (float*)&Pos[1], OutPos[1] ); StoreAlignedSIMD( (float*)&Pos[2], OutPos[2] ); StoreAlignedSIMD( (float*)&Pos[3], OutPos[3] ); StoreAlignedSIMD( (float*)&Pos[4], OutPos[4] ); StoreAlignedSIMD( (float*)&Pos[5], OutPos[5] ); 
StoreAlignedSIMD( (float*)&Pos[6], OutPos[6] ); StoreAlignedSIMD( (float*)&Pos[7], OutPos[7] ); StoreAlignedSIMD( (float*)&Pos[8], OutPos[8] ); StoreAlignedSIMD( (float*)&Pos[9], OutPos[9] ); StoreAlignedSIMD( (float*)&Pos[10], OutPos[10] ); StoreAlignedSIMD( (float*)&Pos[11], OutPos[11] ); StoreAlignedSIMD( (float*)&Pos[12], OutPos[12] ); StoreAlignedSIMD( (float*)&Pos[13], OutPos[13] ); StoreAlignedSIMD( (float*)&Pos[14], OutPos[14] ); StoreAlignedSIMD( (float*)&Pos[15], OutPos[15] ); } #endif