//===== Copyright (c) 1996-2008, Valve Corporation, All rights reserved. ======//
//
// Purpose: Support for mapping from a quad mesh to Bicubic Patches, as a means
//          of rendering approximate Catmull-Clark subdivision surfaces
//
//===========================================================================//

#include "studio.h"
#include "studiorendercontext.h"
#include "materialsystem/imaterialsystem.h"
#include "materialsystem/imaterial.h"
#include "materialsystem/imaterialvar.h"
#include "materialsystem/itexture.h"
#include "materialsystem/imesh.h"
#include "mathlib/mathlib.h"
#include "studiorender.h"
#include "optimize.h"
#include "tier1/convar.h"
#include "tier1/keyvalues.h"
#include "tier0/vprof.h"

// memdbgon must be the last include file in a .cpp file!!!
#include "tier0/memdbgon.h"

#define  R_STUDIOSUBD 
#include "r_studiosubd_patches.h"

#ifdef _DEBUG
// Temporary debug arrays
extern CUtlVector<Vector4D> g_DebugCornerPositions;
extern CUtlVector<Vector4D> g_DebugEdgePositions;
extern CUtlVector<Vector4D> g_DebugInteriorPositions;
#endif

//
//  Check out CL# 584588 for an SSE-ized version of the older versions of these
//	routines, which came from an older MS doc, by way of the DX10 SDK
//

static void R_TransformVert( const Vector *pSrcPos,	matrix3x4_t *pSkinMat, Vector4DAligned &pos )
{
	VPROF_BUDGET( "R_TransformVert", _T("SubD Rendering") );

	// NOTE: Could add SSE stuff here, if we knew what SSE stuff could make it faster
	pos.x  = pSrcPos->x  * (*pSkinMat)[0][0] + pSrcPos->y  * (*pSkinMat)[0][1] + pSrcPos->z  * (*pSkinMat)[0][2] + (*pSkinMat)[0][3];
	pos.y  = pSrcPos->x  * (*pSkinMat)[1][0] + pSrcPos->y  * (*pSkinMat)[1][1] + pSrcPos->z  * (*pSkinMat)[1][2] + (*pSkinMat)[1][3];
	pos.z  = pSrcPos->x  * (*pSkinMat)[2][0] + pSrcPos->y  * (*pSkinMat)[2][1] + pSrcPos->z  * (*pSkinMat)[2][2] + (*pSkinMat)[2][3];
	pos.w = 1.0f;
}


// This function is duplicate code ****
//
// Picks or builds the matrix used to skin one vertex.
//
//   boneweights   - the vertex's bone count (numbones), bone indices (bone[])
//                   and blend weights (weight[])
//   pPoseToWorld  - array of bone matrices, indexed by boneweights.bone[]
//   scratchMatrix - caller-owned storage that receives the blended matrix
//
// Returns a pointer to the matrix to use:
//   * numbones == 1 (or any value outside 2..4, via the default label):
//     a pointer straight into pPoseToWorld for bone[0]. No weight is applied
//     and scratchMatrix is not written.
//   * numbones == 2..4: &scratchMatrix, filled with the sum of each bone matrix
//     scaled by its weight.
//
// The x86 path moves matrix rows with movaps, which requires 16-byte aligned
// addresses, so both the pPoseToWorld entries and scratchMatrix must be 16-byte
// aligned. All other builds forward to the C implementation ComputeSkinMatrix.
static matrix3x4_t *ComputeSkinMatrixSSE( mstudioboneweight_t &boneweights, matrix3x4_t *pPoseToWorld, matrix3x4_t &scratchMatrix )
{
	VPROF_BUDGET( "ComputeSkinMatrixSSE", _T("SubD Rendering") );

	// NOTE: pPoseToWorld, being cache aligned, doesn't need explicit initialization
#if defined( _WIN32 ) && !defined( WIN64 ) && !defined( _X360 ) 
	// 32-bit Windows only: hand-written SSE using MSVC inline assembly.
	// NOTE(review): boneMatN and scratchMatrix are C++ references; the
	// "mov reg, DWORD PTR [name]" lines presumably load the address stored
	// behind each reference -- confirm against the MSVC inline assembler docs.
	switch( boneweights.numbones )
	{
	default:	// Unexpected bone counts are handled like a single bone
	case 1:
		// Single bone: use its matrix in place, nothing to blend
		return &pPoseToWorld[boneweights.bone[0]];

	case 2:
		{
			matrix3x4_t &boneMat0 = pPoseToWorld[boneweights.bone[0]];
			matrix3x4_t &boneMat1 = pPoseToWorld[boneweights.bone[1]];
			float *pWeights = boneweights.weight;

			_asm
			{
				mov		eax, DWORD PTR [pWeights]
				movss	xmm6, dword ptr[eax]		; boneweights.weight[0]
				movss	xmm7, dword ptr[eax + 4]	; boneweights.weight[1]

				mov		eax, DWORD PTR [boneMat0]
				mov		ecx, DWORD PTR [boneMat1]
				mov		edi, DWORD PTR [scratchMatrix]

				// Fill xmm6, and 7 with all the bone weights
				// (broadcast each scalar weight to all four lanes)
				shufps	xmm6, xmm6, 0
				shufps	xmm7, xmm7, 0

				// Load up all rows of the two matrices
				// (rows are 16 bytes apart: offsets 0, 16 and 32)
				movaps	xmm0, XMMWORD PTR [eax]
				movaps	xmm1, XMMWORD PTR [ecx]
				movaps	xmm2, XMMWORD PTR [eax + 16]
				movaps	xmm3, XMMWORD PTR [ecx + 16]
				movaps	xmm4, XMMWORD PTR [eax + 32]
				movaps	xmm5, XMMWORD PTR [ecx + 32]

				// Multiply the rows by the weights
				mulps	xmm0, xmm6
				mulps	xmm1, xmm7
				mulps	xmm2, xmm6
				mulps	xmm3, xmm7
				mulps	xmm4, xmm6
				mulps	xmm5, xmm7

				// Sum the weighted rows of bone 0 and bone 1
				addps	xmm0, xmm1
				addps	xmm2, xmm3
				addps	xmm4, xmm5

				// Store the three blended rows into scratchMatrix
				movaps	XMMWORD PTR [edi], xmm0
				movaps	XMMWORD PTR [edi + 16], xmm2
				movaps	XMMWORD PTR [edi + 32], xmm4
			}
		}
		return &scratchMatrix;

	case 3:
		{
			matrix3x4_t &boneMat0 = pPoseToWorld[boneweights.bone[0]];
			matrix3x4_t &boneMat1 = pPoseToWorld[boneweights.bone[1]];
			matrix3x4_t &boneMat2 = pPoseToWorld[boneweights.bone[2]];
			float *pWeights = boneweights.weight;

			_asm
			{
				mov		eax, DWORD PTR [pWeights]
				movss	xmm5, dword ptr[eax]		; boneweights.weight[0]
				movss	xmm6, dword ptr[eax + 4]	; boneweights.weight[1]
				movss	xmm7, dword ptr[eax + 8]	; boneweights.weight[2]

				mov		eax, DWORD PTR [boneMat0]
				mov		ecx, DWORD PTR [boneMat1]
				mov		edx, DWORD PTR [boneMat2]
				mov		edi, DWORD PTR [scratchMatrix]

				// Fill xmm5, 6, and 7 with all the bone weights
				shufps	xmm5, xmm5, 0
				shufps	xmm6, xmm6, 0
				shufps	xmm7, xmm7, 0

				// Load up the first row of the three matrices
				movaps	xmm0, XMMWORD PTR [eax]
				movaps	xmm1, XMMWORD PTR [ecx]
				movaps	xmm2, XMMWORD PTR [edx]

				// Multiply the rows by the weights
				mulps	xmm0, xmm5
				mulps	xmm1, xmm6
				mulps	xmm2, xmm7

				// Accumulate and store blended row 0
				addps	xmm0, xmm1
				addps	xmm0, xmm2
				movaps	XMMWORD PTR [edi], xmm0

				// Load up the second row of the three matrices
				movaps	xmm0, XMMWORD PTR [eax + 16]
				movaps	xmm1, XMMWORD PTR [ecx + 16]
				movaps	xmm2, XMMWORD PTR [edx + 16]

				// Multiply the rows by the weights
				mulps	xmm0, xmm5
				mulps	xmm1, xmm6
				mulps	xmm2, xmm7

				// Accumulate and store blended row 1
				addps	xmm0, xmm1
				addps	xmm0, xmm2
				movaps	XMMWORD PTR [edi + 16], xmm0	

				// Load up the third row of the three matrices
				movaps	xmm0, XMMWORD PTR [eax + 32]
				movaps	xmm1, XMMWORD PTR [ecx + 32]
				movaps	xmm2, XMMWORD PTR [edx + 32]

				// Multiply the rows by the weights
				mulps	xmm0, xmm5
				mulps	xmm1, xmm6
				mulps	xmm2, xmm7

				// Accumulate and store blended row 2
				addps	xmm0, xmm1
				addps	xmm0, xmm2
				movaps	XMMWORD PTR [edi + 32], xmm0	
			}
		}
		return &scratchMatrix;

	case 4:
		{
			matrix3x4_t &boneMat0 = pPoseToWorld[boneweights.bone[0]];
			matrix3x4_t &boneMat1 = pPoseToWorld[boneweights.bone[1]];
			matrix3x4_t &boneMat2 = pPoseToWorld[boneweights.bone[2]];
			matrix3x4_t &boneMat3 = pPoseToWorld[boneweights.bone[3]];
			float *pWeights = boneweights.weight;

			_asm
			{
				mov		eax, DWORD PTR [pWeights]
				movss	xmm4, dword ptr[eax]		; boneweights.weight[0]
				movss	xmm5, dword ptr[eax + 4]	; boneweights.weight[1]
				movss	xmm6, dword ptr[eax + 8]	; boneweights.weight[2]
				movss	xmm7, dword ptr[eax + 12]	; boneweights.weight[3]

				mov		eax, DWORD PTR [boneMat0]
				mov		ecx, DWORD PTR [boneMat1]
				mov		edx, DWORD PTR [boneMat2]
				mov		esi, DWORD PTR [boneMat3]
				mov		edi, DWORD PTR [scratchMatrix]

				// Fill xmm4, 5, 6, and 7 with all the bone weights
				shufps	xmm4, xmm4, 0
				shufps	xmm5, xmm5, 0
				shufps	xmm6, xmm6, 0
				shufps	xmm7, xmm7, 0

				// Load up the first row of the four matrices
				movaps	xmm0, XMMWORD PTR [eax]
				movaps	xmm1, XMMWORD PTR [ecx]
				movaps	xmm2, XMMWORD PTR [edx]
				movaps	xmm3, XMMWORD PTR [esi]

				// Multiply the rows by the weights
				mulps	xmm0, xmm4
				mulps	xmm1, xmm5
				mulps	xmm2, xmm6
				mulps	xmm3, xmm7

				// Pairwise accumulate (0+1, 2+3), combine, store blended row 0
				addps	xmm0, xmm1
				addps	xmm2, xmm3
				addps	xmm0, xmm2
				movaps	XMMWORD PTR [edi], xmm0

				// Load up the second row of the four matrices
				movaps	xmm0, XMMWORD PTR [eax + 16]
				movaps	xmm1, XMMWORD PTR [ecx + 16]
				movaps	xmm2, XMMWORD PTR [edx + 16]
				movaps	xmm3, XMMWORD PTR [esi + 16]

				// Multiply the rows by the weights
				mulps	xmm0, xmm4
				mulps	xmm1, xmm5
				mulps	xmm2, xmm6
				mulps	xmm3, xmm7

				// Pairwise accumulate, combine, store blended row 1
				addps	xmm0, xmm1
				addps	xmm2, xmm3
				addps	xmm0, xmm2
				movaps	XMMWORD PTR [edi + 16], xmm0	

				// Load up the third row of the four matrices
				movaps	xmm0, XMMWORD PTR [eax + 32]
				movaps	xmm1, XMMWORD PTR [ecx + 32]
				movaps	xmm2, XMMWORD PTR [edx + 32]
				movaps	xmm3, XMMWORD PTR [esi + 32]

				// Multiply the rows by the weights
				mulps	xmm0, xmm4
				mulps	xmm1, xmm5
				mulps	xmm2, xmm6
				mulps	xmm3, xmm7

				// Pairwise accumulate, combine, store blended row 2
				addps	xmm0, xmm1
				addps	xmm2, xmm3
				addps	xmm0, xmm2
				movaps	XMMWORD PTR [edi + 32], xmm0	
			}
		}
		return &scratchMatrix;
	}
#else
	// Every other platform: defer to the portable C implementation
#ifndef LINUX
	#pragma message( "ComputeSkinMatrixSSE C implementation only" )
#endif
	extern matrix3x4_t *ComputeSkinMatrix( mstudioboneweight_t &boneweights, matrix3x4_t *pPoseToWorld, matrix3x4_t &scratchMatrix );
	return ComputeSkinMatrix( boneweights, pPoseToWorld, scratchMatrix );
#endif

	// Not reached: every switch label above, and the #else branch, returns
	Assert( 0 );
	return NULL;
}

#ifdef _DEBUG
// Debug-only, one-shot trigger: when set, GenerateWorldSpacePatches resets the
// debug position arrays, dumps them through DumpDebugPositions() after building
// the patches, and then sets this convar back to 0.
static ConVar mat_tess_dump( "mat_tess_dump", "0", FCVAR_CHEAT );
#endif

// Skins the control cage of a SubD mesh group on the CPU.
//
// For each of the nNumVertices group vertices, pGroupToMesh maps the group index
// to a mesh vertex in pVertices. The vertex position (or its flexed position from
// vertexCache when bDoFlex is set and the vertex is flexed) is transformed by the
// blended bone matrix from pPoseToWorld and written to vOutput[group index].
//
// Side effect: for flexed vertices the cached flex entry is modified in place --
// its m_Position.w is copied into m_TangentS.w.
void CStudioRender::SkinSubDCage( mstudiovertex_t *pVertices, int nNumVertices,
								  matrix3x4_t *pPoseToWorld, CCachedRenderData &vertexCache,
								  unsigned short* pGroupToMesh, fltx4 *vOutput, bool bDoFlex )
{
	VPROF_BUDGET( "CStudioRender::SkinSubDCage", _T("SubD Rendering") );

	// Scratch storage for multi-bone blends, declared 16-byte aligned
	ALIGN16 matrix3x4_t blendScratch ALIGN16_POST;

	Assert( nNumVertices > 0 );

	for ( int nGroupVert = 0; nGroupVert < nNumVertices; ++nGroupVert )
	{
		const unsigned short nMeshVert = pGroupToMesh[nGroupVert];
		mstudiovertex_t &meshVert = pVertices[nMeshVert];

		matrix3x4_t *pBlend = ComputeSkinMatrixSSE( meshVert.m_BoneWeights, pPoseToWorld, blendScratch );

		// Default to the rest position; flexed vertices override it below
		Vector *pCagePos = &meshVert.m_vecPosition;
		if ( bDoFlex && vertexCache.IsVertexFlexed( nMeshVert ) )
		{
			CachedPosNormTan_t *pFlexed = vertexCache.GetFlexVertex( nMeshVert );

			// Carry the signed, 0..3 wrinkle / tangent-flip encoding from position.w over to tangent.w
			pFlexed->m_TangentS.w = pFlexed->m_Position.w;

			pCagePos = &pFlexed->m_Position.AsVector3D();
		}

		// Move the cage vertex into world space and hand it back as a SIMD value
		Vector4DAligned vWorldPos;
		R_TransformVert( pCagePos, pBlend, vWorldPos );
		vOutput[nGroupVert] = LoadAlignedSIMD( (float *) &vWorldPos );
	}
}

// Points the members of 'quad' at consecutive runs of the packed per-patch
// topology stream that starts at topologyIndex, and returns the address just
// past this patch's data (i.e. where the next patch begins).
//
// Layout, in unsigned shorts: nine 4-entry fields, an 8-entry edgeBias field,
// four 4-entry UV index fields, then the variable-length one-ring list whose
// length is the sum of the four vtx1RingSize entries.
inline unsigned short *InitializeTopologyIndexStruct( TopologyIndexStruct &quad, unsigned short *topologyIndex )
{
	unsigned short *pCursor = topologyIndex;

	quad.vtx1RingSize = pCursor;
	pCursor += 4;
	quad.vtx1RingCenterQuadOffset = pCursor;
	pCursor += 4;
	quad.valences = pCursor;
	pCursor += 4;
	quad.minOneRingOffset = pCursor;
	pCursor += 4;
	quad.bndVtx = pCursor;
	pCursor += 4;
	quad.bndEdge = pCursor;
	pCursor += 4;
	quad.cornerVtx = pCursor;
	pCursor += 4;
	quad.loopGapAngle = pCursor;
	pCursor += 4;
	quad.nbCornerVtx = pCursor;
	pCursor += 4;

	// edgeBias is the only fixed field that is 8 entries wide
	quad.edgeBias = pCursor;
	pCursor += 8;

	quad.vUV0 = pCursor;
	pCursor += 4;
	quad.vUV1 = pCursor;
	pCursor += 4;
	quad.vUV2 = pCursor;
	pCursor += 4;
	quad.vUV3 = pCursor;
	pCursor += 4;

	// Variable-length tail: the one-rings of the four corners, back to back
	quad.oneRing = pCursor;

	int nOneRingEntries = 0;
	for ( int nCorner = 0; nCorner < 4; ++nCorner )
	{
		nOneRingEntries += quad.vtx1RingSize[nCorner];
	}

	return pCursor + nOneRingEntries;
}

// When false, GenerateBicubicPatches still skins the cage but returns before
// locking or writing the SubD buffer.
static ConVar mat_tessellation_update_buffers( "mat_tessellation_update_buffers", "1", FCVAR_CHEAT );
// Forwarded to set_UseCornerTangents() each time GenerateWorldSpacePatches runs.
static ConVar mat_tessellation_cornertangents( "mat_tessellation_cornertangents", "1", FCVAR_CHEAT );
// Forwarded to set_ShowACCGeometryTangents() each time GenerateWorldSpacePatches runs.
static ConVar mat_tessellation_accgeometrytangents( "mat_tessellation_accgeometrytangents", "0", FCVAR_CHEAT );

#ifdef _DEBUG

// Returns true when vA and vB are close but not bit-for-bit identical in xyz:
// their xyz distance is below 0.05 while at least one of x, y, z differs.
// The w components are ignored.
bool NotQuiteEqual( Vector4D &vA, Vector4D &vB )
{
	// Exactly matching positions are never reported
	if ( ( vA.x == vB.x ) && ( vA.y == vB.y ) && ( vA.z == vB.z ) )
	{
		return false;
	}

	const float flTolerance = 0.05f;
	Vector4D vOffset = vA - vB;
	float flSeparation = sqrt( vOffset.x * vOffset.x + vOffset.y * vOffset.y + vOffset.z * vOffset.z );

	return flSeparation < flTolerance;
}

void DumpDebugPositions()
{

	for ( int i=0; i< g_DebugCornerPositions.Count(); i++ )
	{
		bool bCrack = false;
		for ( int j=0; j< g_DebugCornerPositions.Count(); j++ )
		{
			if ( NotQuiteEqual( g_DebugCornerPositions[i], g_DebugCornerPositions[j] ) )
			{
				bCrack = true;
				Assert(0);
			}
		}

		DevMsg( "%s C - %.15f, %.15f, %.15f\n", bCrack ? "*** " : "    ", g_DebugCornerPositions[i].x, g_DebugCornerPositions[i].y, g_DebugCornerPositions[i].z );
	}

	for ( int i=0; i< g_DebugEdgePositions.Count(); i++ )
	{
		bool bCrack = false;
		for ( int j=0; j< g_DebugEdgePositions.Count(); j++ )
		{
			if ( NotQuiteEqual( g_DebugEdgePositions[i], g_DebugEdgePositions[j] ) )
			{
				bCrack = true;
			}
		}

		DevMsg( "%s E - %.15f, %.15f, %.15f\n", bCrack ? "*** " : "    ", g_DebugEdgePositions[i].x, g_DebugEdgePositions[i].y, g_DebugEdgePositions[i].z );
	}

	for ( int i=0; i< g_DebugInteriorPositions.Count(); i++ )
	{
		bool bCrack = false;
		for ( int j=0; j< g_DebugInteriorPositions.Count(); j++ )
		{
			if ( NotQuiteEqual( g_DebugInteriorPositions[i], g_DebugInteriorPositions[j] ) )
			{
				bCrack = true;
			}
		}

		DevMsg( "%s I - %.15f, %.15f, %.15f\n", bCrack ? "*** " : "    ", g_DebugInteriorPositions[i].x, g_DebugInteriorPositions[i].y, g_DebugInteriorPositions[i].z );
	}
}

#endif // _DEBUG

// Builds the bicubic patch data for nNumPatches consecutive quads.
//
//   pSubDBuff        - destination; receives 120 floats per patch (see layout below)
//   nNumPatches      - number of quads described by pTopologyIndices
//   pTopologyIndices - packed per-patch topology stream, walked with
//                      InitializeTopologyIndexStruct
//   pWSVertices      - skinned cage vertices the topology indices refer to
//   bRegularPatch    - only consulted by the USE_OPT code path
//
// Per-patch output layout (xyz only, the w of every control point is dropped):
//   floats   0..47   16 geometry control points
//   floats  48..83   12 U-tangent control points
//   floats  84..119  12 V-tangent control points
void GenerateWorldSpacePatches( float *pSubDBuff, int nNumPatches, unsigned short *pTopologyIndices, fltx4 *pWSVertices, bool bRegularPatch )
{
	VPROF_BUDGET( "CStudioRender::GenerateWorldSpacePatches", _T("SubD Rendering") );

	const int nGeoPoints		= 16;
	const int nTanPoints		= 12;
	const int nTanUOffset		= nGeoPoints * 3;					// 48
	const int nTanVOffset		= nTanUOffset + nTanPoints * 3;		// 84
	const int nFloatsPerPatch	= nTanVOffset + nTanPoints * 3;		// 120

	TopologyIndexStruct quad;
	unsigned short *nextPatchIndices = pTopologyIndices;

	set_ShowACCGeometryTangents(mat_tessellation_accgeometrytangents.GetBool());
	set_UseCornerTangents(mat_tessellation_cornertangents.GetBool());

	ALIGN16 Vector4D Geo[16] ALIGN16_POST;
	ALIGN16 Vector4D TanU[12] ALIGN16_POST;
	ALIGN16 Vector4D TanV[12] ALIGN16_POST;

#ifdef _DEBUG
	if ( mat_tess_dump.GetBool() )
	{
		// Debug Arrays
		g_DebugCornerPositions.EnsureCapacity( nNumPatches * 4 );
		g_DebugEdgePositions.EnsureCapacity( nNumPatches * 8 );
		g_DebugInteriorPositions.EnsureCapacity( nNumPatches * 4 );

		// Empty the arrays this time around
		g_DebugCornerPositions.RemoveAll();
		g_DebugEdgePositions.RemoveAll();
		g_DebugInteriorPositions.RemoveAll();
	}
#endif

	for( int p = 0; p < nNumPatches; p++ )
	{
		// Decode this patch's topology header at the top of the iteration.
		// Decoding the *next* patch at the bottom of the loop would read the ring
		// sizes of a patch that does not exist once the last one has been emitted
		// (and would touch the stream even when nNumPatches is 0).
		nextPatchIndices = InitializeTopologyIndexStruct( quad, nextPatchIndices );

#if defined( USE_OPT )
		ComputeACCAllPatches( pWSVertices, &quad, Geo, TanU, TanV, bRegularPatch );
#else
		ComputeACCGeometryPatch( pWSVertices, &quad, Geo );
		ComputeACCTangentPatches( pWSVertices, &quad, Geo, TanU, TanV );
#endif

		// Geometry control points
		for ( int i=0; i < nGeoPoints; i++ )
		{
			pSubDBuff[ i * 3 + 0 ] = Geo[i].x;
			pSubDBuff[ i * 3 + 1 ] = Geo[i].y;
			pSubDBuff[ i * 3 + 2 ] = Geo[i].z;
		}

		// U tangent control points
		for ( int i=0; i < nTanPoints; i++ )
		{
			pSubDBuff[ i * 3 + 0 + nTanUOffset ] = TanU[ i ].x;
			pSubDBuff[ i * 3 + 1 + nTanUOffset ] = TanU[ i ].y;
			pSubDBuff[ i * 3 + 2 + nTanUOffset ] = TanU[ i ].z;
		}

		// V tangent control points
		for ( int i=0; i < nTanPoints; i++ )
		{
			pSubDBuff[ i * 3 + 0 + nTanVOffset ] = TanV[ i ].x;
			pSubDBuff[ i * 3 + 1 + nTanVOffset ] = TanV[ i ].y;
			pSubDBuff[ i * 3 + 2 + nTanVOffset ] = TanV[ i ].z;
		}

		pSubDBuff += nFloatsPerPatch; // ( 16 + 12 + 12 ) control points * 3 floats each
	}

#ifdef _DEBUG
	if ( mat_tess_dump.GetBool() )
	{
		// These should be a particular size
		// (the arrays are not appended to in this function; presumably the patch
		// computation routines above fill them -- confirm in r_studiosubd_patches)
		Assert( g_DebugCornerPositions.Count() == ( nNumPatches * 4 ) );
		Assert( g_DebugEdgePositions.Count() == ( nNumPatches * 8 ) );
		Assert( g_DebugInteriorPositions.Count() == ( nNumPatches * 4 ) );

		DumpDebugPositions();
		mat_tess_dump.SetValue( 0 );		// Turn back off
	}
#endif

}

//-----------------------------------------------------------------------------------
// Top level function for mapping a quad mesh to an array of Bicubic Bezier patches
//
//   pmesh   - mesh whose vertex data supplies the cage vertices
//   pGroup  - mesh group providing the vertex remap, strips, per-strip face counts
//             and topology indices
//   bDoFlex - when true, flexed vertices from the vertex cache replace rest positions
//
// Skins the cage into m_vSkinnedSubDVertices, then (unless
// mat_tessellation_update_buffers is off) locks the SubD buffer, writes one
// 120-float patch record per unique face of every strip, and unlocks it.
// Not implemented on LINUX builds (asserts).
//-----------------------------------------------------------------------------------
void CStudioRender::GenerateBicubicPatches( mstudiomesh_t* pmesh, studiomeshgroup_t* pGroup, bool bDoFlex )
{
#if defined( LINUX )
  	Assert(0);
#else
	VPROF_BUDGET( "CStudioRender::GenerateBicubicPatches", _T("SubD Rendering") );

	FillTables(); // This only does work the first time through

	Assert( pmesh );
	Assert( pGroup );

	const mstudio_meshvertexdata_t *vertData = pmesh->GetVertexData( m_pStudioHdr );
	Assert( vertData );
	if ( !vertData )
		return;		// Same bail-out as SoftwareProcessQuadMesh; avoids a null dereference when asserts are compiled out

	mstudiovertex_t *pVertices = vertData->Vertex( 0 );

	m_vSkinnedSubDVertices.SetCount( pGroup->m_NumVertices );

	// First, apply software flexing and skinning to the vertices
	SkinSubDCage( pVertices, pGroup->m_NumVertices, m_PoseToWorld,
				  m_VertexCache, pGroup->m_pGroupIndexToMeshIndex, m_vSkinnedSubDVertices.Base(), bDoFlex );

	// Early out (the skinned cage above is still refreshed)
	if ( mat_tessellation_update_buffers.GetBool() == false )
		return;

	// Total patch count across all strips determines how much buffer to lock
	int nNumPatches = 0;
	for ( int s=0; s<pGroup->m_NumStrips; ++s )
	{
		nNumPatches += pGroup->m_pUniqueFaces[s];
	}

	// Lock the subd buffers
	CMatRenderContextPtr pRenderContext( g_pMaterialSystem );
	float *pSubDBuff = pRenderContext->LockSubDBuffer( nNumPatches );

	// Floats written per patch by GenerateWorldSpacePatches:
	// ( 16 geometry + 12 U tangent + 12 V tangent ) control points * xyz
	const int nFloatsPerPatch = 120;

	// Now we are in world space, we can map to array of Bicubic patches
	int totalIndices = 0;
	float *pCurrentPtr = pSubDBuff;
	for ( int s=0; s<pGroup->m_NumStrips; ++s )
	{
		OptimizedModel::StripHeader_t *pStrip = &pGroup->m_pStripData[s];
		int StripFaces = pGroup->m_pUniqueFaces[s];

		GenerateWorldSpacePatches( pCurrentPtr, StripFaces, &pGroup->m_pTopologyIndices[totalIndices], m_vSkinnedSubDVertices.Base(), ( pStrip->flags & OptimizedModel::STRIP_IS_QUADLIST_REG ) != 0 );

		// Advance to the next strip's topology data and output region
		totalIndices += pStrip->numTopologyIndices;
		pCurrentPtr += StripFaces * nFloatsPerPatch;
	}

	// Unlock subd buffers
	pRenderContext->UnlockSubDBuffer( );

#endif // !LINUX
}


// Transform Tangent vector
static void R_TransformTangent( const Vector4D *pSrcTangentS, matrix3x4_t *pSkinMat, Vector4DAligned &tangentS )
{
	VPROF_BUDGET( "R_TransformTangent", _T("SubD Rendering") );

	tangentS.x = pSrcTangentS->x * (*pSkinMat)[0][0] + pSrcTangentS->y * (*pSkinMat)[0][1]	+ pSrcTangentS->z * (*pSkinMat)[0][2];
	tangentS.y = pSrcTangentS->x * (*pSkinMat)[1][0] + pSrcTangentS->y * (*pSkinMat)[1][1]	+ pSrcTangentS->z * (*pSkinMat)[1][2];
	tangentS.z = pSrcTangentS->x * (*pSkinMat)[2][0] + pSrcTangentS->y * (*pSkinMat)[2][1]	+ pSrcTangentS->z * (*pSkinMat)[2][2];
	tangentS.w = pSrcTangentS->w;
}

// Transforms per-vertex tangent vector, copies texture coordinates etc into dynamic VB
//
//   pmesh            - mesh supplying vertex and tangent data
//   meshBuilder      - receives four QuadTessVertex_t entries per face
//   numFaces         - number of quads described by pTopologyIndices
//   pGroupToMesh     - not used by this function
//   pTopologyIndices - packed per-patch topology stream, walked with
//                      InitializeTopologyIndexStruct
//   bTangentSpace    - when true, tangents are skinned and written; otherwise every
//                      vertex keeps the default tangent (1, 0, 0, 1)
//   bDoFlex          - when true, flexed tangents from m_VertexCache are used for
//                      vertices that are flexed
//
// Returns without emitting anything if the mesh has no vertex data.
void CStudioRender::SoftwareProcessQuadMesh( mstudiomesh_t* pmesh, CMeshBuilder& meshBuilder, 
											 int numFaces, unsigned short* pGroupToMesh,
											 unsigned short *pTopologyIndices, bool bTangentSpace, bool bDoFlex )
{
	VPROF_BUDGET( "CStudioRender::SoftwareProcessQuadMesh", _T("SubD Rendering") );

	Vector4D *pStudioTangentS = NULL;

	ALIGN16 QuadTessVertex_t quadVertex ALIGN16_POST;

	// QuadTessVertex_t currently has the following map:
	// +-----------------------------------+
	// |  tanX  |  tanY  |  tanZ  | sBWrnk | <- Tangent in .xyz, Binormal sign flip bit plus wrinkle in .w
	// +-----------------------------------+
	// |  tcU0  |  tcV0  |  tcU1  |  tcV1  | <- Interior TC, Parametric V Edge TC
	// +-----------------------------------+
	// |  tcU2  |  tcV2  |  tcU3  |  tcV3  | <- Parametric U Edge TC, Corner TC
	// +-----------------------------------+

	// Default tangent, used as-is for every vertex when bTangentSpace is false
	quadVertex.m_vTangent.Init( 1.0f, 0.0f, 0.0f, 1.0f );

	ALIGN16 matrix3x4_t *pSkinMat, matTemp ALIGN16_POST;

	Assert( numFaces > 0 );

	const mstudio_meshvertexdata_t *pVertData = pmesh->GetVertexData( m_pStudioHdr );
	Assert( pVertData );
	if ( !pVertData )
		return;

	mstudiovertex_t *pVertices = pVertData->Vertex( 0 );


	if ( bTangentSpace )
	{
		pStudioTangentS = pVertData->TangentS( 0 );
	}

	TopologyIndexStruct quad;
	unsigned short *nextPatchIndices = pTopologyIndices;

	for ( int i=0; i < numFaces; ++i )						// Run over faces
	{
		// Decode this face's topology header at the top of the iteration.
		// Decoding the *next* face at the bottom of the loop would read the ring
		// sizes of a face that does not exist once the last one has been emitted.
		nextPatchIndices = InitializeTopologyIndexStruct( quad, nextPatchIndices );

		// Offset into quad.oneRing of the current corner's one-ring; its first
		// entry is used as the corner vertex index
		int patchCorner = 0;

#if 0
		Vector4D debugTangent[4];
		for ( int j=0; j < 4; ++j )
		{
			int idx = quad.oneRing[patchCorner];
			memcpy( &debugTangent[j], &pStudioTangentS[idx], sizeof( Vector4D ) );
			patchCorner += quad.vtx1RingSize[j];
		}

		// These should be the same sign for a given patch.
		// If they're not, that's bad
		Assert( ( debugTangent[0].w == debugTangent[1].w ) &&
				( debugTangent[1].w == debugTangent[2].w ) &&
				( debugTangent[2].w == debugTangent[3].w ) );

		patchCorner = 0;
#endif

		for ( int j=0; j < 4; ++j )							// Four verts per face
		{
			int idx = quad.oneRing[patchCorner];
			mstudiovertex_t &vert = pVertices[idx];

			if ( bTangentSpace )
			{
				pSkinMat = ComputeSkinMatrixSSE( vert.m_BoneWeights, m_PoseToWorld, matTemp );

				if ( bDoFlex && m_VertexCache.IsVertexFlexed( idx ) )
				{
					// Flexed tangent; its .w is taken as stored in the flex cache
					CachedPosNormTan_t* pFlexedVertex = m_VertexCache.GetFlexVertex( idx );
					R_TransformTangent( &(pFlexedVertex->m_TangentS), pSkinMat, *(Vector4DAligned*)&quadVertex.m_vTangent );
				}
				else // non-flexed case
				{
					R_TransformTangent( &pStudioTangentS[idx], pSkinMat, *(Vector4DAligned*)&quadVertex.m_vTangent );
					quadVertex.m_vTangent.w *= 2; // non-flexed vertex should have wrinkle of -2 or +2
				}
			}

			// Store 4 texcoords per quad corner
			quadVertex.m_vUV01.x = pVertices[ quad.vUV0[j] ].m_vecTexCoord.x;
			quadVertex.m_vUV01.y = pVertices[ quad.vUV0[j] ].m_vecTexCoord.y;
			quadVertex.m_vUV01.z = pVertices[ quad.vUV1[j] ].m_vecTexCoord.x;
			quadVertex.m_vUV01.w = pVertices[ quad.vUV1[j] ].m_vecTexCoord.y;
			quadVertex.m_vUV23.x = pVertices[ quad.vUV2[j] ].m_vecTexCoord.x;
			quadVertex.m_vUV23.y = pVertices[ quad.vUV2[j] ].m_vecTexCoord.y;
			quadVertex.m_vUV23.z = pVertices[ quad.vUV3[j] ].m_vecTexCoord.x;
			quadVertex.m_vUV23.w = pVertices[ quad.vUV3[j] ].m_vecTexCoord.y;

			meshBuilder.FastQuadVertexSSE( quadVertex );

			// Step to the start of the next corner's one-ring
			patchCorner += quad.vtx1RingSize[j];
		}
	}

	meshBuilder.FastAdvanceNVertices( numFaces * 4 );
}