tf2/tf2_src/materialsystem/stdshaders/common_vs_fxc.h

//========= Copyright Valve Corporation, All rights reserved. ============//
//
// Purpose: This is where all common code for vertex shaders go.
//
// $NoKeywords: $
//
//===========================================================================//


#ifndef COMMON_VS_FXC_H_
#define COMMON_VS_FXC_H_

#include "common_fxc.h"

// Put global skip commands here. . make sure and check that the appropriate vars are defined
// so these aren't used on the wrong shaders!
// --------------------------------------------------------------------------------
// Ditch all fastpath attemps if we are doing LIGHTING_PREVIEW.
//	SKIP: defined $LIGHTING_PREVIEW && defined $FASTPATH && $LIGHTING_PREVIEW && $FASTPATH
// --------------------------------------------------------------------------------


#ifndef COMPRESSED_VERTS
// Default to no vertex compression
#define COMPRESSED_VERTS 0
#endif

#if ( !defined( SHADER_MODEL_VS_2_0 ) && !defined( SHADER_MODEL_VS_3_0 ) )
#if COMPRESSED_VERTS == 1
#error "Vertex compression is only for DX9 and up!"
#endif
#endif

// We're testing 2 normal compression methods
// One compressed normals+tangents into a SHORT2 each (8 bytes total)
// The other compresses them together, into a single UBYTE4 (4 bytes total)
// FIXME: pick one or the other, compare lighting quality in important cases
#define COMPRESSED_NORMALS_SEPARATETANGENTS_SHORT2	0
#define COMPRESSED_NORMALS_COMBINEDTANGENTS_UBYTE4	1
//#define COMPRESSED_NORMALS_TYPE						COMPRESSED_NORMALS_SEPARATETANGENTS_SHORT2
#define COMPRESSED_NORMALS_TYPE					COMPRESSED_NORMALS_COMBINEDTANGENTS_UBYTE4


#define FOGTYPE_RANGE				0
#define FOGTYPE_HEIGHT				1

#define COMPILE_ERROR ( 1/0; )

// -------------------------
// CONSTANTS
// -------------------------

#pragma def ( vs, c0, 0.0f, 1.0f, 2.0f, 0.5f )

const float4 cConstants1				: register(c1);
#define cOOGamma			cConstants1.x
#define cOverbright			2.0f
#define cOneThird			cConstants1.z
#define cOOOverbright		( 1.0f / 2.0f )


// The g_bLightEnabled registers and g_nLightCountRegister hold the same information regarding
// enabling lights, but callers internal to this file tend to use the loops, while external
// callers will end up using the booleans
const bool g_bLightEnabled[4]			: register(b0);
										// through b3

const int g_nLightCountRegister			: register(i0);


#define g_nLightCount					g_nLightCountRegister.x

const float4 cEyePosWaterZ				: register(c2);
#define cEyePos			cEyePosWaterZ.xyz

// Only cFlexScale.x is used
// It is a binary value used to switch on/off the addition of the flex delta stream
const float4 cFlexScale					: register( c3 );

const float4x4 cModelViewProj			: register(c4);
const float4x4 cViewProj				: register(c8);

// Used to compute projPosZ in shaders without skinning
// Using cModelViewProj with FastClip generates incorrect results
// This is just row two of the non-FastClip cModelViewProj matrix
const float4 cModelViewProjZ			: register(c12);

// More constants working back from the top...
const float4 cViewProjZ					: register(c13);

const float4 cFogParams					: register(c16);
#define cFogEndOverFogRange cFogParams.x
#define cFogOne cFogParams.y
#define cFogMaxDensity cFogParams.z
#define cOOFogRange cFogParams.w

const float4x4 cViewModel				: register(c17);

const float3 cAmbientCubeX [ 2 ] : register ( c21 ) ;
const float3 cAmbientCubeY [ 2 ] : register ( c23 ) ;
const float3 cAmbientCubeZ [ 2 ] : register ( c25 ) ;

#if defined ( SHADER_MODEL_VS_3_0 )
const float4 cFlexWeights [ 512 ] : register ( c1024 ) ;
#endif

struct LightInfo
{
	float4 color;						// {xyz} is color	w is light type code (see comment below)
	float4 dir;							// {xyz} is dir		w is light type code
	float4 pos;
	float4 spotParams;
	float4 atten;
};

// w components of color and dir indicate light type:
// 1x - directional
// 01 - spot
// 00 - point

// Four lights x 5 constants each = 20 constants
LightInfo cLightInfo[4]					: register(c27);
#define LIGHT_0_POSITION_REG					   c29

#ifdef SHADER_MODEL_VS_1_1

const float4 cModulationColor			: register(c37);

#define SHADER_SPECIFIC_CONST_0 c38
#define SHADER_SPECIFIC_CONST_1 c39
#define SHADER_SPECIFIC_CONST_2 c40
#define SHADER_SPECIFIC_CONST_3 c41
#define SHADER_SPECIFIC_CONST_4 c42
#define SHADER_SPECIFIC_CONST_5 c43
#define SHADER_SPECIFIC_CONST_6 c44
#define SHADER_SPECIFIC_CONST_7 c45
#define SHADER_SPECIFIC_CONST_8 c46
#define SHADER_SPECIFIC_CONST_9 c47
#define SHADER_SPECIFIC_CONST_10 c14
#define SHADER_SPECIFIC_CONST_11 c15

static const int cModel0Index = 48;
const float4x3 cModel[16]					: register(c48);
// last cmodel is c105 for dx80, c214 for dx90

#else // DX9 shaders (vs20 and beyond)

const float4 cModulationColor			: register( c47 );

#define SHADER_SPECIFIC_CONST_0 c48
#define SHADER_SPECIFIC_CONST_1 c49
#define SHADER_SPECIFIC_CONST_2 c50
#define SHADER_SPECIFIC_CONST_3 c51
#define SHADER_SPECIFIC_CONST_4 c52
#define SHADER_SPECIFIC_CONST_5 c53
#define SHADER_SPECIFIC_CONST_6 c54
#define SHADER_SPECIFIC_CONST_7 c55
#define SHADER_SPECIFIC_CONST_8 c56
#define SHADER_SPECIFIC_CONST_9 c57
#define SHADER_SPECIFIC_CONST_10 c14
#define SHADER_SPECIFIC_CONST_11 c15

static const int cModel0Index = 58;
const float4x3 cModel[53]				: register( c58 );
// last cmodel is c105 for dx80, c216 for dx90


#define SHADER_SPECIFIC_BOOL_CONST_0 b4
#define SHADER_SPECIFIC_BOOL_CONST_1 b5
#define SHADER_SPECIFIC_BOOL_CONST_2 b6
#define SHADER_SPECIFIC_BOOL_CONST_3 b7
#define SHADER_SPECIFIC_BOOL_CONST_4 b8
#define SHADER_SPECIFIC_BOOL_CONST_5 b9
#define SHADER_SPECIFIC_BOOL_CONST_6 b10
#define SHADER_SPECIFIC_BOOL_CONST_7 b11
#endif // vertex shader model constant packing changes


//=======================================================================================
// Methods to decompress vertex normals
//=======================================================================================

//-----------------------------------------------------------------------------------
// Decompress a normal from two-component compressed format
// We expect this data to come from a signed SHORT2 stream in the range of -32768..32767
//
// -32678 and 0 are invalid encodings
// w contains the sign to use in the cross product when generating a binormal
void _DecompressShort2Tangent( float2 inputTangent, out float4 outputTangent )
{
	float2 ztSigns		= sign( inputTangent );				// sign bits for z and tangent (+1 or -1)
	float2 xyAbs		= abs(  inputTangent );				// 1..32767
	outputTangent.xy	= (xyAbs - 16384.0f) / 16384.0f;	// x and y
	outputTangent.z		= ztSigns.x * sqrt( saturate( 1.0f - dot( outputTangent.xy, outputTangent.xy ) ) );
	outputTangent.w		= ztSigns.y;
}

//-----------------------------------------------------------------------------------
// Same code as _DecompressShort2Tangent, just one returns a float4, one a float3
void _DecompressShort2Normal( float2 inputNormal, out float3 outputNormal )
{
	float4 result;
	_DecompressShort2Tangent( inputNormal, result );
	outputNormal = result.xyz;
}

//-----------------------------------------------------------------------------------
// Decompress normal+tangent together
void _DecompressShort2NormalTangent( float2 inputNormal, float2 inputTangent, out float3 outputNormal, out float4 outputTangent )
{
	// FIXME: if we end up sticking with the SHORT2 format, pack the normal and tangent into a single SHORT4 element
	//        (that would make unpacking normal+tangent here together much cheaper than the sum of their parts)
	_DecompressShort2Normal(  inputNormal,  outputNormal  );
	_DecompressShort2Tangent( inputTangent, outputTangent );
}

//=======================================================================================
// Decompress a normal and tangent from four-component compressed format
// We expect this data to come from an unsigned UBYTE4 stream in the range of 0..255
// The final vTangent.w contains the sign to use in the cross product when generating a binormal
void _DecompressUByte4NormalTangent( float4 inputNormal,
									out float3 outputNormal,   // {nX, nY, nZ}
									out float4 outputTangent )   // {tX, tY, tZ, sign of binormal}
{
	float fOne   = 1.0f;

	float4 ztztSignBits	= ( inputNormal - 128.0f ) < 0;						// sign bits for zs and binormal (1 or 0)  set-less-than (slt) asm instruction
	float4 xyxyAbs		= abs( inputNormal - 128.0f ) - ztztSignBits;		// 0..127
	float4 xyxySignBits	= ( xyxyAbs - 64.0f ) < 0;							// sign bits for xs and ys (1 or 0)
	float4 normTan		= (abs( xyxyAbs - 64.0f ) - xyxySignBits) / 63.0f;	// abs({nX, nY, tX, tY})
	outputNormal.xy		= normTan.xy;										// abs({nX, nY, __, __})
	outputTangent.xy	= normTan.zw;										// abs({tX, tY, __, __})

	float4 xyxySigns	= 1 - 2*xyxySignBits;								// Convert sign bits to signs
	float4 ztztSigns	= 1 - 2*ztztSignBits;								// ( [1,0] -> [-1,+1] )

	outputNormal.z		= 1.0f - outputNormal.x - outputNormal.y;			// Project onto x+y+z=1
	outputNormal.xyz	= normalize( outputNormal.xyz );					// Normalize onto unit sphere
	outputNormal.xy	   *= xyxySigns.xy;										// Restore x and y signs
	outputNormal.z	   *= ztztSigns.x;										// Restore z sign

	outputTangent.z		= 1.0f - outputTangent.x - outputTangent.y;			// Project onto x+y+z=1
	outputTangent.xyz	= normalize( outputTangent.xyz );					// Normalize onto unit sphere
	outputTangent.xy   *= xyxySigns.zw;										// Restore x and y signs
	outputTangent.z	   *= ztztSigns.z;										// Restore z sign
	outputTangent.w		= ztztSigns.w;										// Binormal sign
}


//-----------------------------------------------------------------------------------
// Decompress just a normal from four-component compressed format (same as above)
// We expect this data to come from an unsigned UBYTE4 stream in the range of 0..255
// [ When compiled, this works out to approximately 17 asm instructions ]
void _DecompressUByte4Normal( float4 inputNormal,
							out float3 outputNormal)					// {nX, nY, nZ}
{
	float fOne			= 1.0f;

	float2 ztSigns		= ( inputNormal.xy - 128.0f ) < 0;				// sign bits for zs and binormal (1 or 0)  set-less-than (slt) asm instruction
	float2 xyAbs		= abs( inputNormal.xy - 128.0f ) - ztSigns;		// 0..127
	float2 xySigns		= ( xyAbs -  64.0f ) < 0;						// sign bits for xs and ys (1 or 0)
	outputNormal.xy		= ( abs( xyAbs - 64.0f ) - xySigns ) / 63.0f;	// abs({nX, nY})

	outputNormal.z		= 1.0f - outputNormal.x - outputNormal.y;		// Project onto x+y+z=1
	outputNormal.xyz	= normalize( outputNormal.xyz );				// Normalize onto unit sphere

	outputNormal.xy	   *= lerp( fOne.xx, -fOne.xx, xySigns   );			// Restore x and y signs
	outputNormal.z	   *= lerp( fOne.x,  -fOne.x,  ztSigns.x );			// Restore z sign
}


void DecompressVertex_Normal( float4 inputNormal, out float3 outputNormal )
{
	if ( COMPRESSED_VERTS == 1 )
	{
		if ( COMPRESSED_NORMALS_TYPE == COMPRESSED_NORMALS_SEPARATETANGENTS_SHORT2 )
		{
			_DecompressShort2Normal( inputNormal.xy, outputNormal );
		}
		else // ( COMPRESSED_NORMALS_TYPE == COMPRESSED_NORMALS_COMBINEDTANGENTS_UBYTE4 )
		{
			_DecompressUByte4Normal( inputNormal, outputNormal );
		}
	}
	else
	{
		outputNormal = inputNormal.xyz;
	}
}

void DecompressVertex_NormalTangent( float4 inputNormal,  float4 inputTangent, out float3 outputNormal, out float4 outputTangent )
{
	if ( COMPRESSED_VERTS == 1 )
	{
		if ( COMPRESSED_NORMALS_TYPE == COMPRESSED_NORMALS_SEPARATETANGENTS_SHORT2 )
		{
			_DecompressShort2NormalTangent( inputNormal.xy, inputTangent.xy, outputNormal, outputTangent );
		}
		else // ( COMPRESSED_NORMALS_TYPE == COMPRESSED_NORMALS_COMBINEDTANGENTS_UBYTE4 )
		{
			_DecompressUByte4NormalTangent( inputNormal, outputNormal, outputTangent );
		}
	}
	else
	{
		outputNormal  = inputNormal.xyz;
		outputTangent = inputTangent;
	}
}


#ifdef SHADER_MODEL_VS_3_0

//-----------------------------------------------------------------------------
// Methods to sample morph data from a vertex texture
// NOTE: vMorphTargetTextureDim.x = width, cVertexTextureDim.y = height, cVertexTextureDim.z = # of float4 fields per vertex
// For position + normal morph for example, there will be 2 fields.
//-----------------------------------------------------------------------------
float4 SampleMorphDelta( sampler2D vt, const float3 vMorphTargetTextureDim, const float4 vMorphSubrect, const float flVertexID, const float flField )
{
	float flColumn = floor( flVertexID / vMorphSubrect.w );

	float4 t;
	t.x = vMorphSubrect.x + vMorphTargetTextureDim.z * flColumn + flField + 0.5f;
	t.y = vMorphSubrect.y + flVertexID - flColumn * vMorphSubrect.w + 0.5f;
	t.xy /= vMorphTargetTextureDim.xy;	
	t.z = t.w = 0.f;

	return tex2Dlod( vt, t );
}

// Optimized version which reads 2 deltas
void SampleMorphDelta2( sampler2D vt, const float3 vMorphTargetTextureDim, const float4 vMorphSubrect, const float flVertexID, out float4 delta1, out float4 delta2 )
{
	float flColumn = floor( flVertexID / vMorphSubrect.w );

	float4 t;
	t.x = vMorphSubrect.x + vMorphTargetTextureDim.z * flColumn + 0.5f;
	t.y = vMorphSubrect.y + flVertexID - flColumn * vMorphSubrect.w + 0.5f;
	t.xy /= vMorphTargetTextureDim.xy;	
	t.z = t.w = 0.f;

	delta1 = tex2Dlod( vt, t );
	t.x += 1.0f / vMorphTargetTextureDim.x;
	delta2 = tex2Dlod( vt, t );
}

#endif // SHADER_MODEL_VS_3_0


#if ( defined( SHADER_MODEL_VS_2_0 ) || defined( SHADER_MODEL_VS_3_0 ) )

//-----------------------------------------------------------------------------
// Method to apply morphs
//-----------------------------------------------------------------------------
bool ApplyMorph( float3 vPosFlex, inout float3 vPosition )
{
	// Flexes coming in from a separate stream
	float3 vPosDelta = vPosFlex.xyz * cFlexScale.x;
	vPosition.xyz += vPosDelta;
	return true;
}

bool ApplyMorph( float3 vPosFlex, float3 vNormalFlex, inout float3 vPosition, inout float3 vNormal )
{
	// Flexes coming in from a separate stream
	float3 vPosDelta = vPosFlex.xyz * cFlexScale.x;
	float3 vNormalDelta = vNormalFlex.xyz * cFlexScale.x;
	vPosition.xyz += vPosDelta;
	vNormal       += vNormalDelta;
	return true;
}

bool ApplyMorph( float3 vPosFlex, float3 vNormalFlex, 
	inout float3 vPosition, inout float3 vNormal, inout float3 vTangent )
{
	// Flexes coming in from a separate stream
	float3 vPosDelta = vPosFlex.xyz * cFlexScale.x;
	float3 vNormalDelta = vNormalFlex.xyz * cFlexScale.x;
	vPosition.xyz += vPosDelta;
	vNormal       += vNormalDelta;
	vTangent.xyz  += vNormalDelta;
	return true;
}

bool ApplyMorph( float4 vPosFlex, float3 vNormalFlex, 
	inout float3 vPosition, inout float3 vNormal, inout float3 vTangent, out float flWrinkle )
{
	// Flexes coming in from a separate stream
	float3 vPosDelta = vPosFlex.xyz * cFlexScale.x;
	float3 vNormalDelta = vNormalFlex.xyz * cFlexScale.x;
	flWrinkle = vPosFlex.w * cFlexScale.y;
	vPosition.xyz += vPosDelta;
	vNormal       += vNormalDelta;
	vTangent.xyz  += vNormalDelta;
	return true;
}

#endif // defined( SHADER_MODEL_VS_2_0 ) || defined( SHADER_MODEL_VS_3_0 )


#ifdef SHADER_MODEL_VS_3_0

bool ApplyMorph( sampler2D morphSampler, const float3 vMorphTargetTextureDim, const float4 vMorphSubrect, 
				const float flVertexID, const float3 vMorphTexCoord,
				inout float3 vPosition )
{
#if MORPHING

#if !DECAL
	// Flexes coming in from a separate stream
	float4 vPosDelta = SampleMorphDelta( morphSampler, vMorphTargetTextureDim, vMorphSubrect, flVertexID, 0 );
	vPosition	+= vPosDelta.xyz;
#else
	float4 t = float4( vMorphTexCoord.x, vMorphTexCoord.y, 0.0f, 0.0f );
	float3 vPosDelta = tex2Dlod( morphSampler, t );
	vPosition	+= vPosDelta.xyz * vMorphTexCoord.z;
#endif // DECAL

	return true;

#else // !MORPHING
	return false;
#endif
}
 
bool ApplyMorph( sampler2D morphSampler, const float3 vMorphTargetTextureDim, const float4 vMorphSubrect, 
				const float flVertexID, const float3 vMorphTexCoord, 
				inout float3 vPosition, inout float3 vNormal )
{
#if MORPHING

#if !DECAL
	float4 vPosDelta, vNormalDelta;
	SampleMorphDelta2( morphSampler, vMorphTargetTextureDim, vMorphSubrect, flVertexID, vPosDelta, vNormalDelta );
	vPosition	+= vPosDelta.xyz;
	vNormal		+= vNormalDelta.xyz;
#else
	float4 t = float4( vMorphTexCoord.x, vMorphTexCoord.y, 0.0f, 0.0f );
	float3 vPosDelta = tex2Dlod( morphSampler, t );
	t.x += 1.0f / vMorphTargetTextureDim.x;
	float3 vNormalDelta = tex2Dlod( morphSampler, t );
	vPosition	+= vPosDelta.xyz * vMorphTexCoord.z;
	vNormal		+= vNormalDelta.xyz * vMorphTexCoord.z;
#endif // DECAL

	return true;

#else // !MORPHING
	return false;
#endif
}

bool ApplyMorph( sampler2D morphSampler, const float3 vMorphTargetTextureDim, const float4 vMorphSubrect, 
				const float flVertexID, const float3 vMorphTexCoord, 
				inout float3 vPosition, inout float3 vNormal, inout float3 vTangent )
{
#if MORPHING

#if !DECAL
	float4 vPosDelta, vNormalDelta;
	SampleMorphDelta2( morphSampler, vMorphTargetTextureDim, vMorphSubrect, flVertexID, vPosDelta, vNormalDelta );
	vPosition	+= vPosDelta.xyz;
	vNormal		+= vNormalDelta.xyz;
	vTangent	+= vNormalDelta.xyz;
#else
	float4 t = float4( vMorphTexCoord.x, vMorphTexCoord.y, 0.0f, 0.0f );
	float3 vPosDelta = tex2Dlod( morphSampler, t );
	t.x += 1.0f / vMorphTargetTextureDim.x;
	float3 vNormalDelta = tex2Dlod( morphSampler, t );
	vPosition	+= vPosDelta.xyz * vMorphTexCoord.z;
	vNormal		+= vNormalDelta.xyz * vMorphTexCoord.z;
	vTangent	+= vNormalDelta.xyz * vMorphTexCoord.z;
#endif // DECAL

	return true;

#else // MORPHING

	return false;
#endif
}

bool ApplyMorph( sampler2D morphSampler, const float3 vMorphTargetTextureDim, const float4 vMorphSubrect,
	const float flVertexID, const float3 vMorphTexCoord,
	inout float3 vPosition, inout float3 vNormal, inout float3 vTangent, out float flWrinkle )
{
#if MORPHING

#if !DECAL
	float4 vPosDelta, vNormalDelta;
	SampleMorphDelta2( morphSampler, vMorphTargetTextureDim, vMorphSubrect, flVertexID, vPosDelta, vNormalDelta );
	vPosition	+= vPosDelta.xyz;
	vNormal		+= vNormalDelta.xyz;
	vTangent	+= vNormalDelta.xyz;
	flWrinkle = vPosDelta.w;
#else
	float4 t = float4( vMorphTexCoord.x, vMorphTexCoord.y, 0.0f, 0.0f );
	float4 vPosDelta = tex2Dlod( morphSampler, t );
	t.x += 1.0f / vMorphTargetTextureDim.x;
	float3 vNormalDelta = tex2Dlod( morphSampler, t );

	vPosition	+= vPosDelta.xyz * vMorphTexCoord.z;
	vNormal		+= vNormalDelta.xyz * vMorphTexCoord.z;
	vTangent	+= vNormalDelta.xyz * vMorphTexCoord.z;
	flWrinkle	= vPosDelta.w * vMorphTexCoord.z;
#endif // DECAL

	return true;

#else // MORPHING

	flWrinkle = 0.0f;
	return false;

#endif
}

#endif   // SHADER_MODEL_VS_3_0


float RangeFog( const float3 projPos )
{
	return max( cFogMaxDensity, ( -projPos.z * cOOFogRange + cFogEndOverFogRange ) );
}

float WaterFog( const float3 worldPos, const float3 projPos )
{
	float4 tmp;
	
	tmp.xy = cEyePosWaterZ.wz - worldPos.z;

	// tmp.x is the distance from the water surface to the vert
	// tmp.y is the distance from the eye position to the vert

	// if $tmp.x < 0, then set it to 0
	// This is the equivalent of moving the vert to the water surface if it's above the water surface
	
	tmp.x = max( 0.0f, tmp.x );

	// $tmp.w = $tmp.x / $tmp.y
	tmp.w = tmp.x / tmp.y;

	tmp.w *= projPos.z;

	// $tmp.w is now the distance that we see through water.

	return max( cFogMaxDensity, ( -tmp.w * cOOFogRange + cFogOne ) );
}

float CalcFog( const float3 worldPos, const float3 projPos, const int fogType )
{
#if defined( _X360 )
	// 360 only does pixel fog
	return 1.0f;
#endif

	if( fogType == FOGTYPE_RANGE )
	{
		return RangeFog( projPos );
	}
	else
	{
#if SHADERMODEL_VS_2_0 == 1
		// We do this work in the pixel shader in dx9, so don't do any fog here.
		return 1.0f;
#else
		return WaterFog( worldPos, projPos );
#endif
	}
}

float CalcFog( const float3 worldPos, const float3 projPos, const bool bWaterFog )
{
#if defined( _X360 )
	// 360 only does pixel fog
	return 1.0f;
#endif

	float flFog;
	if( !bWaterFog )
	{
		flFog = RangeFog( projPos );
	}
	else
	{
#if SHADERMODEL_VS_2_0 == 1
		// We do this work in the pixel shader in dx9, so don't do any fog here.
		flFog = 1.0f;
#else
		flFog = WaterFog( worldPos, projPos );
#endif
	}

	return flFog;
}

float4 DecompressBoneWeights( const float4 weights )
{
	float4 result = weights;

	if ( COMPRESSED_VERTS )
	{
		// Decompress from SHORT2 to float. In our case, [-1, +32767] -> [0, +1]
		// NOTE: we add 1 here so we can divide by 32768 - which is exact (divide by 32767 is not).
		//       This avoids cracking between meshes with different numbers of bone weights.
		//       We use SHORT2 instead of SHORT2N for a similar reason - the GPU's conversion
		//       from [-32768,+32767] to [-1,+1] is imprecise in the same way.
		result += 1;
		result /= 32768;
	}

	return result;
}

void SkinPosition( bool bSkinning, const float4 modelPos, 
                   const float4 boneWeights, float4 fBoneIndices,
				   out float3 worldPos )
{
#if !defined( _X360 )
	int3 boneIndices = D3DCOLORtoUBYTE4( fBoneIndices );
#else
	int3 boneIndices = fBoneIndices;
#endif

	// Needed for invariance issues caused by multipass rendering
#if defined( _X360 )
	[isolate] 
#endif
	{ 
		if ( !bSkinning )
		{
			worldPos = mul4x3( modelPos, cModel[0] );
		}
		else // skinning - always three bones
		{
			float4x3 mat1 = cModel[boneIndices[0]];
			float4x3 mat2 = cModel[boneIndices[1]];
			float4x3 mat3 = cModel[boneIndices[2]];

			float3 weights = DecompressBoneWeights( boneWeights ).xyz;
			weights[2] = 1 - (weights[0] + weights[1]);

			float4x3 blendMatrix = mat1 * weights[0] + mat2 * weights[1] + mat3 * weights[2];
			worldPos = mul4x3( modelPos, blendMatrix );
		}
	}
}

void SkinPositionAndNormal( bool bSkinning, const float4 modelPos, const float3 modelNormal,
                            const float4 boneWeights, float4 fBoneIndices,
						    out float3 worldPos, out float3 worldNormal )
{
	// Needed for invariance issues caused by multipass rendering
#if defined( _X360 )
	[isolate] 
#endif
	{ 

#if !defined( _X360 )
		int3 boneIndices = D3DCOLORtoUBYTE4( fBoneIndices );
#else
		int3 boneIndices = fBoneIndices;
#endif

		if ( !bSkinning )
		{
			worldPos = mul4x3( modelPos, cModel[0] );
			worldNormal = mul3x3( modelNormal, ( const float3x3 )cModel[0] );
		}
		else // skinning - always three bones
		{
			float4x3 mat1 = cModel[boneIndices[0]];
			float4x3 mat2 = cModel[boneIndices[1]];
			float4x3 mat3 = cModel[boneIndices[2]];

			float3 weights = DecompressBoneWeights( boneWeights ).xyz;
			weights[2] = 1 - (weights[0] + weights[1]);

			float4x3 blendMatrix = mat1 * weights[0] + mat2 * weights[1] + mat3 * weights[2];
			worldPos = mul4x3( modelPos, blendMatrix );
			worldNormal = mul3x3( modelNormal, ( float3x3 )blendMatrix );
		}

	} // end [isolate]
}

// Is it worth keeping SkinPosition and SkinPositionAndNormal around since the optimizer
// gets rid of anything that isn't used?
void SkinPositionNormalAndTangentSpace( 
							bool bSkinning,
						    const float4 modelPos, const float3 modelNormal, 
							const float4 modelTangentS,
                            const float4 boneWeights, float4 fBoneIndices,
						    out float3 worldPos, out float3 worldNormal, 
							out float3 worldTangentS, out float3 worldTangentT )
{
#if !defined( _X360 )
	int3 boneIndices = D3DCOLORtoUBYTE4( fBoneIndices );
#else
	int3 boneIndices = fBoneIndices;
#endif

	// Needed for invariance issues caused by multipass rendering
#if defined( _X360 )
	[isolate] 
#endif
	{ 
		if ( !bSkinning )
		{
			worldPos = mul4x3( modelPos, cModel[0] );
			worldNormal = mul3x3( modelNormal, ( const float3x3 )cModel[0] );
			worldTangentS = mul3x3( ( float3 )modelTangentS, ( const float3x3 )cModel[0] );
		}
		else // skinning - always three bones
		{
			float4x3 mat1 = cModel[boneIndices[0]];
			float4x3 mat2 = cModel[boneIndices[1]];
			float4x3 mat3 = cModel[boneIndices[2]];

			float3 weights = DecompressBoneWeights( boneWeights ).xyz;
			weights[2] = 1 - (weights[0] + weights[1]);

			float4x3 blendMatrix = mat1 * weights[0] + mat2 * weights[1] + mat3 * weights[2];
			worldPos = mul4x3( modelPos, blendMatrix );
			worldNormal = mul3x3( modelNormal, ( const float3x3 )blendMatrix );
			worldTangentS = mul3x3( ( float3 )modelTangentS, ( const float3x3 )blendMatrix );
		}
		worldTangentT = cross( worldNormal, worldTangentS ) * modelTangentS.w;
	}
}


//-----------------------------------------------------------------------------
// Lighting helper functions
//-----------------------------------------------------------------------------

float3 AmbientLight( const float3 worldNormal )
{
	float3 nSquared = worldNormal * worldNormal;
	int3 isNegative = ( worldNormal < 0.0 );
	float3 linearColor;
	linearColor = nSquared.x * cAmbientCubeX[isNegative.x] +
	              nSquared.y * cAmbientCubeY[isNegative.y] +
	              nSquared.z * cAmbientCubeZ[isNegative.z];
	return linearColor;
}

// The following "internal" routines are called "privately" by other routines in this file which
// handle the particular flavor of vs20 control flow appropriate to the original caller
float VertexAttenInternal( const float3 worldPos, int lightNum )
{
	float result = 0.0f;

	// Get light direction
	float3 lightDir = cLightInfo[lightNum].pos - worldPos;

	// Get light distance squared.
	float lightDistSquared = dot( lightDir, lightDir );

	// Get 1/lightDistance
	float ooLightDist = rsqrt( lightDistSquared );

	// Normalize light direction
	lightDir *= ooLightDist;

	float3 vDist;
#	if defined( _X360 )
	{
		//X360 dynamic compile hits an internal compiler error using dst(), this is the breakdown of how dst() works from the 360 docs.
		vDist.x = 1;
		vDist.y = lightDistSquared * ooLightDist;
		vDist.z = lightDistSquared;
		//flDist.w = ooLightDist;
	}
#	else
	{
		vDist = dst( lightDistSquared, ooLightDist );
	}
#	endif

	float flDistanceAtten = 1.0f / dot( cLightInfo[lightNum].atten.xyz, vDist );

	// Spot attenuation
	float flCosTheta = dot( cLightInfo[lightNum].dir.xyz, -lightDir );
	float flSpotAtten = (flCosTheta - cLightInfo[lightNum].spotParams.z) * cLightInfo[lightNum].spotParams.w;
	flSpotAtten = max( 0.0001f, flSpotAtten );
	flSpotAtten = pow( flSpotAtten, cLightInfo[lightNum].spotParams.x );
	flSpotAtten = saturate( flSpotAtten );

	// Select between point and spot
	float flAtten = lerp( flDistanceAtten, flDistanceAtten * flSpotAtten, cLightInfo[lightNum].dir.w );

	// Select between above and directional (no attenuation)
	result = lerp( flAtten, 1.0f, cLightInfo[lightNum].color.w );

	return result;
}

float CosineTermInternal( const float3 worldPos, const float3 worldNormal, int lightNum, bool bHalfLambert )
{
	// Calculate light direction assuming this is a point or spot
	float3 lightDir = normalize( cLightInfo[lightNum].pos - worldPos );

	// Select the above direction or the one in the structure, based upon light type
	lightDir = lerp( lightDir, -cLightInfo[lightNum].dir, cLightInfo[lightNum].color.w );

	// compute N dot L
	float NDotL = dot( worldNormal, lightDir );

	if ( !bHalfLambert )
	{
		NDotL = max( 0.0f, NDotL );
	}
	else	// Half-Lambert
	{
		NDotL = NDotL * 0.5 + 0.5;
		NDotL = NDotL * NDotL;
	}
	return NDotL;
}

// This routine uses booleans to do early-outs and is meant to be called by routines OUTSIDE of this file
float GetVertexAttenForLight( const float3 worldPos, int lightNum, bool bUseStaticControlFlow )
{
	float result = 0.0f;

	// Direct3D uses static control flow but OpenGL currently does not
	if ( bUseStaticControlFlow )
	{
		if ( g_bLightEnabled[lightNum] )
		{
			result = VertexAttenInternal( worldPos, lightNum );
		}
	}
	else // OpenGL non-static-control-flow path
	{
		result = VertexAttenInternal( worldPos, lightNum );
	}

	return result;
}

float3 DoLightInternal( const float3 worldPos, const float3 worldNormal, int lightNum, bool bHalfLambert )
{
	return cLightInfo[lightNum].color *
		CosineTermInternal( worldPos, worldNormal, lightNum, bHalfLambert ) *
		VertexAttenInternal( worldPos, lightNum );
}

float3 DoLighting( const float3 worldPos, const float3 worldNormal,
				   const float3 staticLightingColor, const bool bStaticLight,
				   const bool bDynamicLight, bool bHalfLambert )
{
	float3 linearColor = float3( 0.0f, 0.0f, 0.0f );

	if( bStaticLight )			// Static light
	{
		float3 col = staticLightingColor * cOverbright;
#if defined ( _X360 )
		linearColor += col * col;
#else
		linearColor += GammaToLinear( col );
#endif
	}

	if( bDynamicLight )			// Dynamic light
	{
		for (int i = 0; i < g_nLightCount; i++)
		{
			linearColor += DoLightInternal( worldPos, worldNormal, i, bHalfLambert );
		}		
	}

	if( bDynamicLight )
	{
		linearColor += AmbientLight( worldNormal ); //ambient light is already remapped
	}

	return linearColor;
}

float3 DoLightingUnrolled( const float3 worldPos, const float3 worldNormal,
				  const float3 staticLightingColor, const bool bStaticLight,
				  const bool bDynamicLight, bool bHalfLambert, const int nNumLights )
{
	float3 linearColor = float3( 0.0f, 0.0f, 0.0f );

	if( bStaticLight )			// Static light
	{
		linearColor += GammaToLinear( staticLightingColor * cOverbright );
	}

	if( bDynamicLight )			// Ambient light
	{
		if ( nNumLights >= 1 )
			linearColor += DoLightInternal( worldPos, worldNormal, 0, bHalfLambert );
		if ( nNumLights >= 2 )
			linearColor += DoLightInternal( worldPos, worldNormal, 1, bHalfLambert );
		if ( nNumLights >= 3 )
			linearColor += DoLightInternal( worldPos, worldNormal, 2, bHalfLambert );
		if ( nNumLights >= 4 )
			linearColor += DoLightInternal( worldPos, worldNormal, 3, bHalfLambert );
	}

	if( bDynamicLight )
	{
		linearColor += AmbientLight( worldNormal ); //ambient light is already remapped
	}

	return linearColor;
}

int4 FloatToInt( in float4 floats )
{
	return D3DCOLORtoUBYTE4( floats.zyxw / 255.001953125 );
}

float2 ComputeSphereMapTexCoords( in float3 reflectionVector )
{
	// transform reflection vector into view space
	reflectionVector = mul( reflectionVector, ( float3x3 )cViewModel );

	// generate <rx ry rz+1>
	float3 tmp = float3( reflectionVector.x, reflectionVector.y, reflectionVector.z + 1.0f );

	// find 1 / len
	float ooLen = dot( tmp, tmp );
	ooLen = 1.0f / sqrt( ooLen );

	// tmp = tmp/|tmp| + 1
	tmp.xy = ooLen * tmp.xy + 1.0f;

	return tmp.xy * 0.5f;
}


#define DEFORMATION_CLAMP_TO_BOX_IN_WORLDSPACE 1
							// minxyz.minsoftness / maxxyz.maxsoftness
float3 ApplyDeformation( float3 worldpos, int deftype, float4 defparms0, float4 defparms1,
						 float4 defparms2, float4 defparms3 )
{
	float3 ret = worldpos;
	if ( deftype == DEFORMATION_CLAMP_TO_BOX_IN_WORLDSPACE )
	{
		ret=max( ret, defparms2.xyz );
		ret=min( ret, defparms3.xyz );
	}

	return ret;
}


#endif //#ifndef COMMON_VS_FXC_H_