csgo/cstrike15_src/utils/vrad/lightmap.cpp

//========= Copyright � 1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose: 
//
// $NoKeywords: $
//
//=============================================================================//

#include "vrad.h"
#include "lightmap.h"
#include "radial.h"
#include "mathlib/bumpvects.h"
#include "tier1/utlvector.h"
#include "vmpi.h"
#include "mathlib/anorms.h"
#include "map_utils.h"
#include "mathlib/halton.h"
#include "imagepacker.h"
#include "tier1/utlrbtree.h"
#include "tier1/utlbuffer.h"
#include "bitmap/tgawriter.h"
#include "mathlib/quantize.h"
#include "bitmap/imageformat.h"
#include "coordsize.h"

enum
{
	AMBIENT_ONLY = 0x1,
	NON_AMBIENT_ONLY = 0x2,
};

#define SMOOTHING_GROUP_HARD_EDGE	0xff000000

//==========================================================================//
// Ambient occlusion
//==========================================================================//
bool g_bNoSoften = false;
bool g_bNoAO = false;

float CalculateAmbientOcclusion( Vector *pPosition, Vector *pNormal )
{
	// Just call through to the simd version of this function
	FourVectors position4;
	position4.DuplicateVector( *pPosition );

	FourVectors normal4;
	position4.DuplicateVector( *pNormal );

	fltx4 ao = CalculateAmbientOcclusion4( position4, normal4, -1 );

	return SubFloat( ao, 0 );
}

fltx4 CalculateAmbientOcclusion4( const FourVectors &position4, const FourVectors &normal4, int static_prop_index_to_ignore )
{
	if ( g_bNoAO )
	{
		return Four_Ones;
	}

	DirectionalSampler_t sampler;
	int nSamples = 32;
	if ( do_fast )
	{
		nSamples /= 2;
	}

	fltx4 totalVisible = Four_Zeros;
	fltx4 totalPossibleVisible = Four_Zeros;
	for ( int i = 0; i < nSamples; i++ )
	{
		FourVectors rayStart = position4;
		rayStart += normal4;

		// Ray direction on the sphere
		FourVectors rayDirection;
		rayDirection.DuplicateVector( sampler.NextValue() );

		// Mirror ray along normal so all rays are on the hemisphere defined by the normal
		fltx4 rayDotN = rayDirection * normal4; // dot product
		fltx4 absRayDotN = AbsSIMD( rayDotN );
		rayDirection = rayDirection - Mul( normal4, rayDotN ) + Mul( normal4, absRayDotN );

		// Set length of ray
		FourVectors rayEnd = rayDirection;
		rayEnd *= 36.0f;
		rayEnd += rayStart;

		// Raytrace for visibility function
		fltx4 fractionVisible = Four_Ones;
		TestLine_IgnoreSky( rayStart, rayEnd, &fractionVisible, static_prop_index_to_ignore );
		totalVisible = AddSIMD( totalVisible, MulSIMD( fractionVisible, absRayDotN ) );
		totalPossibleVisible = AddSIMD( totalPossibleVisible, absRayDotN );
	}

	fltx4 ao = DivSIMD( totalVisible, totalPossibleVisible );
	ao = MulSIMD( ao, ao ); // Square ao term - This is an artistic choice by the CS:GO team
	return ao;
}

//==========================================================================//
// Give surfaces a softer look instead of the harsher linear N.L look
//==========================================================================//
float SoftenCosineTerm( float flDot )
{
	if ( g_bNoSoften )
		return flDot;

	flDot = MAX( flDot, 0.0f );
	return ( flDot + ( flDot * flDot ) ) * 0.5f; // This is cheaper than an exponent in shader code
}

fltx4 SoftenCosineTerm( fltx4 dots )
{
	if ( g_bNoSoften )
		return dots;

	dots = MaxSIMD( dots, Four_Zeros );
	fltx4 dotsSquared = MulSIMD( dots, dots );
	return MulSIMD( AddSIMD( dots, dotsSquared ), Four_PointFives );
}

//==========================================================================//
// CNormalList.
//==========================================================================//

// This class keeps a list of unique normals and provides a fast 
class CNormalList
{
public:
	CNormalList();
	
	// Adds the normal if unique. Otherwise, returns the normal's index into m_Normals.
	int FindOrAddNormal( Vector const &vNormal );


public:
	
	CUtlVector<Vector>	m_Normals;


private:

	// This represents a grid from (-1,-1,-1) to (1,1,1).
	enum {NUM_SUBDIVS = 8};
	CUtlVector<int>	m_NormalGrid[NUM_SUBDIVS][NUM_SUBDIVS][NUM_SUBDIVS];
};


int g_iCurFace;
edgeshare_t	edgeshare[MAX_MAP_EDGES];

Vector	face_centroids[MAX_MAP_EDGES];

int vertexref[MAX_MAP_VERTS];
int *vertexface[MAX_MAP_VERTS];
faceneighbor_t faceneighbor[MAX_MAP_FACES];

static directlight_t *gSkyLight = NULL;
static directlight_t *gAmbient = NULL;

//==========================================================================//
// CNormalList implementation.
//==========================================================================//

CNormalList::CNormalList() : m_Normals( 128 )
{
	for( int i=0; i < sizeof(m_NormalGrid)/sizeof(m_NormalGrid[0][0][0]); i++ )
	{
		(&m_NormalGrid[0][0][0] + i)->SetGrowSize( 16 );
	}
}

int CNormalList::FindOrAddNormal( Vector const &vNormal )
{
	int gi[3];

	// See which grid element it's in.
	for( int iDim=0; iDim < 3; iDim++ )
	{
		gi[iDim] = (int)( ((vNormal[iDim] + 1.0f) * 0.5f) * NUM_SUBDIVS - 0.000001f );
		gi[iDim] = min( gi[iDim], NUM_SUBDIVS );
		gi[iDim] = max( gi[iDim], 0 );
	}

	// Look for a matching vector in there.
	CUtlVector<int> *pGridElement = &m_NormalGrid[gi[0]][gi[1]][gi[2]];
	for( int i=0; i < pGridElement->Count(); i++ )
	{
		int iNormal = pGridElement->Element(i);

		Vector *pVec = &m_Normals[iNormal];
		//if( pVec->DistToSqr(vNormal) < 0.00001f )
		if( *pVec == vNormal )
			return iNormal;
	}

	// Ok, add a new one.
	pGridElement->AddToTail( m_Normals.Count() );
	return m_Normals.AddToTail( vNormal );
}

// FIXME: HACK until the plane normals are made more happy
void GetBumpNormals( const float* sVect, const float* tVect, const Vector& flatNormal, 
					 const Vector& phongNormal, Vector bumpNormals[NUM_BUMP_VECTS] )
{
	Vector stmp( sVect[0], sVect[1], sVect[2] );
	Vector ttmp( tVect[0], tVect[1], tVect[2] );
	GetBumpNormals( stmp, ttmp, flatNormal, phongNormal, bumpNormals );
}

int EdgeVertex( dface_t *f, int edge )
{
	int k;

	if (edge < 0)
		edge += f->numedges;
	else if (edge >= f->numedges)
		edge = edge % f->numedges;

	k = dsurfedges[f->firstedge + edge];
	if (k < 0)
	{
		// Msg("(%d %d) ", dedges[-k].v[1], dedges[-k].v[0] );
		return dedges[-k].v[1];
	}
	else
	{
		// Msg("(%d %d) ", dedges[k].v[0], dedges[k].v[1] );
		return dedges[k].v[0];
	}
}


/*
  ============
  PairEdges
  ============
*/
void PairEdges (void)
{
	int		        i, j, k, n, m;
	dface_t	        *f;
	int             numneighbors;
    int             tmpneighbor[64];
	faceneighbor_t  *fn;

	// count number of faces that reference each vertex
	for (i=0, f = g_pFaces; i<numfaces ; i++, f++)
	{
        for (j=0 ; j<f->numedges ; j++)
        {
            // Store the count in vertexref
            vertexref[EdgeVertex(f,j)]++;
        }
	}

	// allocate room
	for (i = 0; i < numvertexes; i++)
	{
		// use the count from above to allocate a big enough array
		vertexface[i] = ( int* )calloc( vertexref[i], sizeof( vertexface[0] ) );
		// clear the temporary data
		vertexref[i] = 0;
	}

	// store a list of every face that uses a particular vertex
	for (i=0, f = g_pFaces ; i<numfaces ; i++, f++)
	{
        for (j=0 ; j<f->numedges ; j++)
        {
            n = EdgeVertex(f,j);
            
            for (k = 0; k < vertexref[n]; k++)
            {
                if (vertexface[n][k] == i)
                    break;
            }
            if (k >= vertexref[n])
            {
                // add the face to the list
                vertexface[n][k] = i;
                vertexref[n]++;
            }
        }
	}

	// calc normals and set displacement surface flag
	for (i=0, f = g_pFaces; i<numfaces ; i++, f++)
	{
		fn = &faceneighbor[i];

		// get face normal
		VectorCopy( dplanes[f->planenum].normal, fn->facenormal );

		// set displacement surface flag
		fn->bHasDisp = false;
		if( ValidDispFace( f ) )
		{
			fn->bHasDisp = true;
		}
	}

	// find neighbors
	for (i=0, f = g_pFaces ; i<numfaces ; i++, f++)
	{
		numneighbors = 0;
		fn = &faceneighbor[i];

        // allocate room for vertex normals
        fn->normal = ( Vector* )calloc( f->numedges, sizeof( fn->normal[0] ) );
     		
        // look up all faces sharing vertices and add them to the list
        for (j=0 ; j<f->numedges ; j++)
        {
            n = EdgeVertex(f,j);
            
            for (k = 0; k < vertexref[n]; k++)
            {
                double	cos_normals_angle;
                Vector  *pNeighbornormal;
                
                // skip self
                if (vertexface[n][k] == i)
                    continue;

				// if this face doens't have a displacement -- don't consider displacement neighbors
				if( ( !fn->bHasDisp ) && ( faceneighbor[vertexface[n][k]].bHasDisp ) )
					continue;

                pNeighbornormal = &faceneighbor[vertexface[n][k]].facenormal;
                cos_normals_angle = DotProduct( *pNeighbornormal, fn->facenormal );
					
				// add normal if >= threshold or its a displacement surface (this is only if the original
				// face is a displacement)
				if ( fn->bHasDisp )
				{
					// Always smooth with and against a displacement surface.
					VectorAdd( fn->normal[j], *pNeighbornormal, fn->normal[j] );
				}
				else
				{
					// No smoothing - use of method (backwards compatibility).
					if ( ( f->smoothingGroups == 0 ) && ( g_pFaces[vertexface[n][k]].smoothingGroups == 0 ) )
					{
						if ( cos_normals_angle >= smoothing_threshold )
						{
							VectorAdd( fn->normal[j], *pNeighbornormal, fn->normal[j] );
						}
						else
						{
							// not considered a neighbor
							continue;
						}
					}
					else
					{
						unsigned int smoothingGroup = ( f->smoothingGroups & g_pFaces[vertexface[n][k]].smoothingGroups );

						// Hard edge.
						if ( ( smoothingGroup & SMOOTHING_GROUP_HARD_EDGE ) != 0 )
							continue;

						if ( smoothingGroup != 0 )
						{
							VectorAdd( fn->normal[j], *pNeighbornormal, fn->normal[j] );
						}
						else
						{
							// not considered a neighbor
							continue;
						}
					}
				}

				// look to see if we've already added this one
				for (m = 0; m < numneighbors; m++)
				{
					if (tmpneighbor[m] == vertexface[n][k])
						break;
				}
				
				if (m >= numneighbors)
				{
					// add to neighbor list
					tmpneighbor[m] = vertexface[n][k];
					numneighbors++;
					if ( numneighbors > ARRAYSIZE(tmpneighbor) )
					{
						Error("Stack overflow in neighbors\n");
					}
				}
            }
        }

        if (numneighbors)
        {
            // copy over neighbor list
            fn->numneighbors = numneighbors;
            fn->neighbor = ( int* )calloc( numneighbors, sizeof( fn->neighbor[0] ) );
            for (m = 0; m < numneighbors; m++)
            {
                fn->neighbor[m] = tmpneighbor[m];
            }
        }
        
		// fixup normals
        for (j = 0; j < f->numedges; j++)
        {
            VectorAdd( fn->normal[j], fn->facenormal, fn->normal[j] );
            VectorNormalize( fn->normal[j] );
        }
    }
}


void SaveVertexNormals( void )
{
	faceneighbor_t *fn;
	int i, j;
	dface_t *f;
	CNormalList normalList;

	g_numvertnormalindices = 0;

	for( i = 0 ;i<numfaces ; i++ )
	{
		fn = &faceneighbor[i];
		f = &g_pFaces[i];

		for( j = 0; j < f->numedges; j++ )
		{
			Vector vNormal; 
			if( fn->normal )
			{
				vNormal = fn->normal[j];
			}
			else
			{
				// original faces don't have normals
				vNormal.Init( 0, 0, 0 );
			}
			
			if( g_numvertnormalindices == MAX_MAP_VERTNORMALINDICES )
			{
				Error( "g_numvertnormalindices == MAX_MAP_VERTNORMALINDICES" );
			}
			
			g_vertnormalindices[g_numvertnormalindices] = (unsigned short)normalList.FindOrAddNormal( vNormal );
			g_numvertnormalindices++;
		}
	}

	if( normalList.m_Normals.Count() > MAX_MAP_VERTNORMALS )
	{
		Error( "g_numvertnormals > MAX_MAP_VERTNORMALS" );
	}

	// Copy the list of unique vert normals into g_vertnormals.
	g_numvertnormals = normalList.m_Normals.Count();
	memcpy( g_vertnormals, normalList.m_Normals.Base(), sizeof(g_vertnormals[0]) * normalList.m_Normals.Count() );
}

/*
  =================================================================

  LIGHTMAP SAMPLE GENERATION

  =================================================================
*/


//-----------------------------------------------------------------------------
// Purpose: Spits out an error message with information about a lightinfo_t.
// Input  : s - Error message string.
//			l - lightmap info struct.
//-----------------------------------------------------------------------------
void ErrorLightInfo(const char *s, lightinfo_t *l)
{
	texinfo_t *tex = &texinfo[l->face->texinfo];
	winding_t *w = WindingFromFace(&g_pFaces[l->facenum], l->modelorg);

	//
	// Show the face center and material name if possible.
	//
	if (w != NULL)
	{
		// Don't exit, we'll try to recover...
		Vector vecCenter;
		WindingCenter(w, vecCenter);
//		FreeWinding(w);

		Warning("%s at (%g, %g, %g)\n\tmaterial=%s\n", s, (double)vecCenter.x, (double)vecCenter.y, (double)vecCenter.z, TexDataStringTable_GetString( dtexdata[tex->texdata].nameStringTableID ) );
	}
	//
	// If not, just show the material name.
	//
	else
	{
		Warning("%s at (degenerate face)\n\tmaterial=%s\n", TexDataStringTable_GetString( dtexdata[tex->texdata].nameStringTableID ));
	}
}


void CalcFaceVectors(lightinfo_t *l)
{
	texinfo_t	*tex;
	int			i, j;

	tex = &texinfo[l->face->texinfo];
	
    // move into lightinfo_t
	for (i=0 ; i<2 ; i++)
	{
		for (j=0 ; j<3 ; j++)
		{
			l->worldToLuxelSpace[i][j] = tex->lightmapVecsLuxelsPerWorldUnits[i][j];
		}
	}

	//Solve[ { x * w00 + y * w01 + z * w02 - s == 0, x * w10 + y * w11 + z * w12 - t == 0, A * x + B * y + C * z + D == 0 }, { x, y, z } ]
	//Rule(x,(  C*s*w11 - B*s*w12 + B*t*w02 - C*t*w01 + D*w02*w11 - D*w01*w12) / (+ A*w01*w12 - A*w02*w11 + B*w02*w10 - B*w00*w12 + C*w00*w11 - C*w01*w10 )),
	//Rule(y,(  A*s*w12 - C*s*w10 + C*t*w00 - A*t*w02 + D*w00*w12 - D*w02*w10) / (+ A*w01*w12 - A*w02*w11 + B*w02*w10 - B*w00*w12 + C*w00*w11 - C*w01*w10 )),
	//Rule(z,(  B*s*w10 - A*s*w11 + A*t*w01 - B*t*w00 + D*w01*w10 - D*w00*w11) / (+ A*w01*w12 - A*w02*w11 + B*w02*w10 - B*w00*w12 + C*w00*w11 - C*w01*w10 ))))

	Vector luxelSpaceCross;

	luxelSpaceCross[0] = 
		tex->lightmapVecsLuxelsPerWorldUnits[1][1] * tex->lightmapVecsLuxelsPerWorldUnits[0][2] - 
		tex->lightmapVecsLuxelsPerWorldUnits[1][2] * tex->lightmapVecsLuxelsPerWorldUnits[0][1];
	luxelSpaceCross[1] = 
		tex->lightmapVecsLuxelsPerWorldUnits[1][2] * tex->lightmapVecsLuxelsPerWorldUnits[0][0] - 
		tex->lightmapVecsLuxelsPerWorldUnits[1][0] * tex->lightmapVecsLuxelsPerWorldUnits[0][2];
	luxelSpaceCross[2] = 
		tex->lightmapVecsLuxelsPerWorldUnits[1][0] * tex->lightmapVecsLuxelsPerWorldUnits[0][1] - 
		tex->lightmapVecsLuxelsPerWorldUnits[1][1] * tex->lightmapVecsLuxelsPerWorldUnits[0][0];

	float det = -DotProduct( l->facenormal, luxelSpaceCross );
	if ( fabs( det ) < 1.0e-20 )
	{
		Warning(" warning - face vectors parallel to face normal. bad lighting will be produced\n" );
		l->luxelOrigin = vec3_origin;
	}
	else
	{
		// invert the matrix
		l->luxelToWorldSpace[0][0]	= (l->facenormal[2] * l->worldToLuxelSpace[1][1] - l->facenormal[1] * l->worldToLuxelSpace[1][2]) / det;
		l->luxelToWorldSpace[1][0]	= (l->facenormal[1] * l->worldToLuxelSpace[0][2] - l->facenormal[2] * l->worldToLuxelSpace[0][1]) / det;
		l->luxelOrigin[0]			= -(l->facedist * luxelSpaceCross[0]) / det;
		l->luxelToWorldSpace[0][1]  = (l->facenormal[0] * l->worldToLuxelSpace[1][2] - l->facenormal[2] * l->worldToLuxelSpace[1][0]) / det; 
		l->luxelToWorldSpace[1][1]  = (l->facenormal[2] * l->worldToLuxelSpace[0][0] - l->facenormal[0] * l->worldToLuxelSpace[0][2]) / det; 
		l->luxelOrigin[1]			= -(l->facedist * luxelSpaceCross[1]) / det;
		l->luxelToWorldSpace[0][2]  = (l->facenormal[1] * l->worldToLuxelSpace[1][0] - l->facenormal[0] * l->worldToLuxelSpace[1][1]) / det; 
		l->luxelToWorldSpace[1][2]  = (l->facenormal[0] * l->worldToLuxelSpace[0][1] - l->facenormal[1] * l->worldToLuxelSpace[0][0]) / det; 
		l->luxelOrigin[2]			= -(l->facedist * luxelSpaceCross[2]) / det;

		// adjust for luxel offset
		VectorMA( l->luxelOrigin, -tex->lightmapVecsLuxelsPerWorldUnits[0][3], l->luxelToWorldSpace[0], l->luxelOrigin );
		VectorMA( l->luxelOrigin, -tex->lightmapVecsLuxelsPerWorldUnits[1][3], l->luxelToWorldSpace[1], l->luxelOrigin );
	}
	// compensate for org'd bmodels
	VectorAdd (l->luxelOrigin, l->modelorg, l->luxelOrigin);
}


winding_t *LightmapCoordWindingForFace( lightinfo_t *l )
{
	int			i;
	winding_t	*w;

	w = WindingFromFace( l->face, l->modelorg );

	for (i = 0; i < w->numpoints; i++)
	{
		Vector2D coord;
		WorldToLuxelSpace( l, w->p[i], coord );
		w->p[i].x = coord.x;
		w->p[i].y = coord.y;
		w->p[i].z = 0;
	}

	return w;
}


void WriteCoordWinding (FILE *out, lightinfo_t *l, winding_t *w, Vector& color )
{
	int			i;
	Vector		pos;

	fprintf (out, "%i\n", w->numpoints);
	for (i=0 ; i<w->numpoints ; i++)
	{
		LuxelSpaceToWorld( l, w->p[i][0], w->p[i][1], pos );
		fprintf (out, "%5.2f %5.2f %5.2f %5.3f %5.3f %5.3f\n",
				 pos[0],
				 pos[1],
				 pos[2],
				 color[ 0 ] / 256,
				 color[ 1 ] / 256,
				 color[ 2 ] / 256 );
	}
}


//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
void DumpFaces( lightinfo_t *pLightInfo, int ndxFace )
{
	static	FileHandle_t out;

	// get face data
	faceneighbor_t *fn = &faceneighbor[ndxFace];
	Vector &centroid = face_centroids[ndxFace];

	// disable threading (not a multi-threadable function!)
	ThreadLock();
	
	if( !out )
	{
		// open the file
		out = g_pFileSystem->Open( "face.txt", "w" );
		if( !out )
			return;
	}
	
	//
	// write out face
	//
	for( int ndxEdge = 0; ndxEdge < pLightInfo->face->numedges; ndxEdge++ )
	{
//		int edge = dsurfedges[pLightInfo->face->firstedge+ndxEdge];

		Vector p1, p2;
		VectorAdd( dvertexes[EdgeVertex( pLightInfo->face, ndxEdge )].point, pLightInfo->modelorg, p1 );
		VectorAdd( dvertexes[EdgeVertex( pLightInfo->face, ndxEdge+1 )].point, pLightInfo->modelorg, p2 );
		
		Vector &n1 = fn->normal[ndxEdge];
		Vector &n2 = fn->normal[(ndxEdge+1)%pLightInfo->face->numedges];
		
		CmdLib_FPrintf( out, "3\n");
		
		CmdLib_FPrintf(out, "%f %f %f %f %f %f\n", p1[0], p1[1], p1[2], n1[0] * 0.5 + 0.5, n1[1] * 0.5 + 0.5, n1[2] * 0.5 + 0.5 );
		
		CmdLib_FPrintf(out, "%f %f %f %f %f %f\n", p2[0], p2[1], p2[2], n2[0] * 0.5 + 0.5, n2[1] * 0.5 + 0.5, n2[2] * 0.5 + 0.5 );
		
		CmdLib_FPrintf(out, "%f %f %f %f %f %f\n", centroid[0] + pLightInfo->modelorg[0], 
					   centroid[1] + pLightInfo->modelorg[1], 
					   centroid[2] + pLightInfo->modelorg[2], 
					   fn->facenormal[0] * 0.5 + 0.5, 
					   fn->facenormal[1] * 0.5 + 0.5, 
					   fn->facenormal[2] * 0.5 + 0.5 );
		
	}
	
	// enable threading
	ThreadUnlock();
}


//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
bool BuildFacesamplesAndLuxels_DoFast( lightinfo_t *pLightInfo, facelight_t *pFaceLight )
{
	// lightmap size
	int width = pLightInfo->face->m_LightmapTextureSizeInLuxels[0]+1;
	int height = pLightInfo->face->m_LightmapTextureSizeInLuxels[1]+1;

	// ratio of world area / lightmap area
	texinfo_t *pTex = &texinfo[pLightInfo->face->texinfo];
	pFaceLight->worldAreaPerLuxel = 1.0 / ( sqrt( DotProduct( pTex->lightmapVecsLuxelsPerWorldUnits[0], 
															  pTex->lightmapVecsLuxelsPerWorldUnits[0] ) ) * 
											sqrt( DotProduct( pTex->lightmapVecsLuxelsPerWorldUnits[1], 
															  pTex->lightmapVecsLuxelsPerWorldUnits[1] ) ) );

	//
	// quickly create samples and luxels (copy over samples)
	//
	pFaceLight->numsamples = width * height;
	pFaceLight->sample = ( sample_t* )calloc( pFaceLight->numsamples, sizeof( *pFaceLight->sample ) );
	if( !pFaceLight->sample )
		return false;

	pFaceLight->numluxels = width * height;
	pFaceLight->luxel = ( Vector* )calloc( pFaceLight->numluxels, sizeof( *pFaceLight->luxel ) );
	if( !pFaceLight->luxel )
		return false;

	sample_t *pSamples = pFaceLight->sample;
	Vector	 *pLuxels = pFaceLight->luxel;

	for( int t = 0; t < height; t++ )
	{
		for( int s = 0; s < width; s++ )
		{
			pSamples->s = s;
			pSamples->t = t;
			pSamples->coord[0] = s;
			pSamples->coord[1] = t;
			// unused but initialized anyway
			pSamples->mins[0] = s - 0.5;
			pSamples->mins[1] = t - 0.5;
			pSamples->maxs[0] = s + 0.5;
			pSamples->maxs[1] = t + 0.5;
			pSamples->area = pFaceLight->worldAreaPerLuxel;
			LuxelSpaceToWorld( pLightInfo, pSamples->coord[0], pSamples->coord[1], pSamples->pos );
			VectorCopy( pSamples->pos, *pLuxels );

			pSamples++;
			pLuxels++;
		}
	}

	return true;
}


//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
bool BuildSamplesAndLuxels_DoFast( lightinfo_t *pLightInfo, facelight_t *pFaceLight, int ndxFace )
{
	// build samples for a "face"
	if( pLightInfo->face->dispinfo == -1 )
	{
		return BuildFacesamplesAndLuxels_DoFast( pLightInfo, pFaceLight );
	}
	// build samples for a "displacement"
	else
	{
		return StaticDispMgr()->BuildDispSamplesAndLuxels_DoFast( pLightInfo, pFaceLight, ndxFace );
	}
}


//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
bool BuildFacesamples( lightinfo_t *pLightInfo, facelight_t *pFaceLight )
{
	// lightmap size
	int width = pLightInfo->face->m_LightmapTextureSizeInLuxels[0]+1;
	int height = pLightInfo->face->m_LightmapTextureSizeInLuxels[1]+1;

	// ratio of world area / lightmap area
	texinfo_t *pTex = &texinfo[pLightInfo->face->texinfo];
	pFaceLight->worldAreaPerLuxel = 1.0 / ( sqrt( DotProduct( pTex->lightmapVecsLuxelsPerWorldUnits[0], 
															  pTex->lightmapVecsLuxelsPerWorldUnits[0] ) ) * 
											sqrt( DotProduct( pTex->lightmapVecsLuxelsPerWorldUnits[1], 
															  pTex->lightmapVecsLuxelsPerWorldUnits[1] ) ) );

	// allocate a large number of samples for creation -- get copied later!
	CUtlVector<sample_t> sampleData;
	sampleData.SetCount( SINGLE_BRUSH_MAP * 2 );
	sample_t *samples = sampleData.Base();
	sample_t *pSamples = samples;

	// lightmap space winding
	winding_t *pLightmapWinding = LightmapCoordWindingForFace( pLightInfo ); 

	//
	// build vector pointing along the lightmap cutting planes
	//
	Vector sNorm( 1.0f, 0.0f, 0.0f );
	Vector tNorm( 0.0f, 1.0f, 0.0f );

	// sample center offset
	float sampleOffset = ( do_centersamples ) ? 0.5 : 1.0;

	//
	// clip the lightmap "spaced" winding by the lightmap cutting planes
	//
	winding_t *pWindingT1, *pWindingT2;
	winding_t *pWindingS1, *pWindingS2;
	float dist;

	for( int t = 0; t < height && pLightmapWinding; t++ )
	{
		dist = t + sampleOffset;
		
		// lop off a sample in the t dimension
		// hack - need a separate epsilon for lightmap space since ON_EPSILON is for texture space
		ClipWindingEpsilon( pLightmapWinding, tNorm, dist, ON_EPSILON / 16.0f, &pWindingT1, &pWindingT2 );

		for( int s = 0; s < width && pWindingT2; s++ )
		{
			dist = s + sampleOffset;

			// lop off a sample in the s dimension, and put it in ws2
			// hack - need a separate epsilon for lightmap space since ON_EPSILON is for texture space
			ClipWindingEpsilon( pWindingT2, sNorm, dist, ON_EPSILON / 16.0f, &pWindingS1, &pWindingS2 );

			//
			// s2 winding is a single sample worth of winding
			//
			if( pWindingS2 )
			{
				// save the s, t positions
				pSamples->s = s;
				pSamples->t = t;

				// get the lightmap space area of ws2 and convert to world area
				// and find the center (then convert it to 2D)
				Vector center;
				pSamples->area = WindingAreaAndBalancePoint(  pWindingS2, center ) * pFaceLight->worldAreaPerLuxel;
				pSamples->coord[0] = center.x; 
				pSamples->coord[1] = center.y;

				// find winding bounds (then convert it to 2D)
				Vector minbounds, maxbounds;
				WindingBounds( pWindingS2, minbounds, maxbounds );
				pSamples->mins[0] = minbounds.x; 
				pSamples->mins[1] = minbounds.y;
				pSamples->maxs[0] = maxbounds.x; 
				pSamples->maxs[1] = maxbounds.y;

				// convert from lightmap space to world space
				LuxelSpaceToWorld( pLightInfo, pSamples->coord[0], pSamples->coord[1], pSamples->pos );

				if (g_bDumpPatches || (do_extra && pSamples->area < pFaceLight->worldAreaPerLuxel - EQUAL_EPSILON))
				{
					//
					// convert the winding from lightmaps space to world for debug rendering and sub-sampling
					//
					Vector worldPos;
					for( int ndxPt = 0; ndxPt < pWindingS2->numpoints; ndxPt++ )
					{
						LuxelSpaceToWorld( pLightInfo, pWindingS2->p[ndxPt].x, pWindingS2->p[ndxPt].y, worldPos );
						VectorCopy( worldPos, pWindingS2->p[ndxPt] );
					}
					pSamples->w = pWindingS2;
				}
				else
				{
					// winding isn't needed, free it.
					pSamples->w = NULL;
					FreeWinding( pWindingS2 );
				}

				pSamples++;
			}

			//
			// if winding T2 still exists free it and set it equal S1 (the rest of the row minus the sample just created)
			//
			if( pWindingT2 )
			{
				FreeWinding( pWindingT2 );
			}

			// clip the rest of "s"
			pWindingT2 = pWindingS1;
		}

		//
		// if the original lightmap winding exists free it and set it equal to T1 (the rest of the winding not cut into samples) 
		//
		if( pLightmapWinding )
		{
			FreeWinding( pLightmapWinding );
		}

		if( pWindingT2 )
		{
			FreeWinding( pWindingT2 );
		}

		pLightmapWinding = pWindingT1;
	}

	//
	// copy over samples
	//
	pFaceLight->numsamples = pSamples - samples;
	pFaceLight->sample = ( sample_t* )calloc( pFaceLight->numsamples, sizeof( *pFaceLight->sample ) );
	if( !pFaceLight->sample )
		return false;

	memcpy( pFaceLight->sample, samples, pFaceLight->numsamples * sizeof( *pFaceLight->sample ) );

	// supply a default sample normal (face normal - assumed flat)
	for( int ndxSample = 0; ndxSample < pFaceLight->numsamples; ndxSample++ )
	{
		Assert ( VectorLength ( pLightInfo->facenormal ) > 1.0e-20);
		pFaceLight->sample[ndxSample].normal = pLightInfo->facenormal;
	}

	// statistics - warning?!
	if( pFaceLight->numsamples == 0 )
	{
		Msg( "no samples %d\n", pLightInfo->face - g_pFaces );
	}

	return true;
}


//-----------------------------------------------------------------------------
// Purpose: Free any windings used by this facelight. It's currently assumed they're not needed again
//-----------------------------------------------------------------------------
void FreeSampleWindings( facelight_t *fl )
{
	int i;
	for (i = 0; i < fl->numsamples; i++)
	{
		if (fl->sample[i].w)
		{
			FreeWinding( fl->sample[i].w );
			fl->sample[i].w = NULL;
		}
	}
}


//-----------------------------------------------------------------------------
// Purpose: build the sample data for each lightmapped primitive type
//-----------------------------------------------------------------------------
bool BuildSamples( lightinfo_t *pLightInfo, facelight_t *pFaceLight, int ndxFace )
{
	// build samples for a "face"
	if( pLightInfo->face->dispinfo == -1 )
	{
		return BuildFacesamples( pLightInfo, pFaceLight );
	}
	// build samples for a "displacement"
	else
	{
		return StaticDispMgr()->BuildDispSamples( pLightInfo, pFaceLight, ndxFace );
	}
}


//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
bool BuildFaceLuxels( lightinfo_t *pLightInfo, facelight_t *pFaceLight )
{
	// lightmap size
	int width = pLightInfo->face->m_LightmapTextureSizeInLuxels[0]+1;
	int height = pLightInfo->face->m_LightmapTextureSizeInLuxels[1]+1;

	// calcuate actual luxel points
	pFaceLight->numluxels = width * height;
	pFaceLight->luxel = ( Vector* )calloc( pFaceLight->numluxels, sizeof( *pFaceLight->luxel ) );
	if( !pFaceLight->luxel )
		return false;

	for( int t = 0; t < height; t++ )
	{
		for( int s = 0; s < width; s++ )
		{
			LuxelSpaceToWorld( pLightInfo, s, t, pFaceLight->luxel[s+t*width] );
		}
	}

	return true;
}


//-----------------------------------------------------------------------------
// Purpose: build the luxels (find the luxel centers) for each lightmapped
//          primitive type
//-----------------------------------------------------------------------------
bool BuildLuxels( lightinfo_t *pLightInfo, facelight_t *pFaceLight, int ndxFace )
{
	// build luxels for a "face"
	if( pLightInfo->face->dispinfo == -1 )
	{
		return BuildFaceLuxels( pLightInfo, pFaceLight );
	}
	// build luxels for a "displacement"
	else
	{
		return StaticDispMgr()->BuildDispLuxels( pLightInfo, pFaceLight, ndxFace );
	}
}


//-----------------------------------------------------------------------------
// Purpose: for each face, find the center of each luxel; for each texture
//          aligned grid point, back project onto the plane and get the world
//          xyz value of the sample point
// NOTE: ndxFace = facenum
//-----------------------------------------------------------------------------
void CalcPoints( lightinfo_t *pLightInfo, facelight_t *pFaceLight, int ndxFace )
{
	// debugging!
	if( g_bDumpPatches ) 
	{
		DumpFaces( pLightInfo, ndxFace );
	}

	// quick and dirty!
	if( do_fast )
	{
		if( !BuildSamplesAndLuxels_DoFast( pLightInfo, pFaceLight, ndxFace ) )
		{
			Msg( "Face %d: (Fast)Error Building Samples and Luxels\n", ndxFace );
		}
		return;
	}

	// build the samples
	if( !BuildSamples( pLightInfo, pFaceLight, ndxFace ) )
	{
		Msg( "Face %d: Error Building Samples\n", ndxFace );
	}

	// build the luxels
	if( !BuildLuxels( pLightInfo, pFaceLight, ndxFace ) )
	{
		Msg( "Face %d: Error Building Luxels\n", ndxFace );
	}
}


//==============================================================

directlight_t	*activelights;
directlight_t	*freelights;

facelight_t		facelight[MAX_MAP_FACES];
int				numdlights;

/*
  ==================
  FindTargetEntity
  ==================
*/
entity_t *FindTargetEntity (char *target)
{
	int		i;
	char	*n;

	for (i=0 ; i<num_entities ; i++)
	{
		n = ValueForKey (&entities[i], "targetname");
		if (!strcmp (n, target))
			return &entities[i];
	}

	return NULL;
}


/*
  =============
  AllocDLight
  =============
*/

int GetVisCache( int lastoffset, int cluster, byte *pvs );
void SetDLightVis( directlight_t *dl, int cluster );
void MergeDLightVis( directlight_t *dl, int cluster );

directlight_t *AllocDLight( Vector& origin, bool bAddToList )
{
	directlight_t *dl;

	dl = ( directlight_t* )calloc(1, sizeof(directlight_t));
	dl->index = numdlights++;

	VectorCopy( origin, dl->light.origin );

	dl->light.cluster = ClusterFromPoint(dl->light.origin);
	SetDLightVis( dl, dl->light.cluster );

	dl->facenum = -1;

	if ( bAddToList )
	{
		dl->next = activelights;
		activelights = dl;
	}

	return dl;
}

void AddDLightToActiveList( directlight_t *dl )
{
	dl->next = activelights;
	activelights = dl;
}

void FreeDLights()
{
	gSkyLight = NULL;
	gAmbient = NULL;

	directlight_t *pNext;
	for( directlight_t *pCur=activelights; pCur; pCur=pNext )
	{
		pNext = pCur->next;
		free( pCur );
	}
	activelights = 0;
}


void SetDLightVis( directlight_t *dl, int cluster )
{
	if (dl->pvs == NULL)
	{
		dl->pvs = (byte *)calloc( 1, (dvis->numclusters / 8) + 1 );
	}

	GetVisCache( -1, cluster, dl->pvs );
}

void MergeDLightVis( directlight_t *dl, int cluster )
{
	if (dl->pvs == NULL)
	{
		SetDLightVis( dl, cluster );
	}
	else
	{
		byte		pvs[MAX_MAP_CLUSTERS/8];
		GetVisCache( -1, cluster, pvs );

		// merge both vis graphs
		for (int i = 0; i < (dvis->numclusters / 8) + 1; i++)
		{
			dl->pvs[i] |= pvs[i];
		}
	}
}


/*
  =============
  LightForKey
  =============
*/
int LightForKey (entity_t *ent, char *key, Vector& intensity )
{
	char *pLight;

	pLight = ValueForKey( ent, key );

	return LightForString( pLight, intensity );
}

int LightForString( char *pLight, Vector& intensity )
{
	double r, g, b, scaler;
	int argCnt;

	VectorFill( intensity, 0 );

	// scanf into doubles, then assign, so it is vec_t size independent
	r = g = b = scaler = 0;
	double r_hdr,g_hdr,b_hdr,scaler_hdr;
	argCnt = sscanf ( pLight, "%lf %lf %lf %lf %lf %lf %lf %lf", 
					  &r, &g, &b, &scaler, &r_hdr,&g_hdr,&b_hdr,&scaler_hdr );

	if (argCnt==8) 											// 2 4-tuples
	{
		if (g_bHDR)
		{
			r=r_hdr;
			g=g_hdr;
			b=b_hdr;
			scaler=scaler_hdr;
		}
		argCnt=4;
	}

	// make sure light is legal
	if( r < 0.0f || g < 0.0f || b < 0.0f || scaler < 0.0f )
	{
		intensity.Init( 0.0f, 0.0f, 0.0f );
		return false;
	}

	intensity[0] = pow( r / 255.0, 2.2 ) * 255;				// convert to linear
	
	switch( argCnt)
	{
		case 1:
			// The R,G,B values are all equal.
			intensity[1] = intensity[2] = intensity[0]; 
			break;
			
		case 3:
		case 4:
			// Save the other two G,B values.
			intensity[1] = pow( g / 255.0, 2.2 ) * 255;
			intensity[2] = pow( b / 255.0, 2.2 ) * 255;
			
			// Did we also get an "intensity" scaler value too?
			if ( argCnt == 4 )
			{
				// Scale the normalized 0-255 R,G,B values by the intensity scaler
				VectorScale( intensity, scaler / 255.0, intensity );
			}
			break;

		default:
			printf("unknown light specifier type - %s\n",pLight);
			return false;
	}
	// scale up source lights by scaling factor
	VectorScale( intensity, lightscale, intensity );
	return true;
}

//-----------------------------------------------------------------------------
// Various parsing methods
//-----------------------------------------------------------------------------

static void ParseLightGeneric( entity_t *e, directlight_t *dl )
{
	entity_t		*e2;
	char	        *target;
	Vector	        dest;

	dl->light.style = (int)FloatForKey (e, "style");
	dl->m_bSkyLightIsDirectionalLight = false;

	if( (int)FloatForKeyWithDefault(e, "_castentityshadow", 1.0f ) != 0 )
	{
		dl->light.flags |= DWL_FLAGS_CASTENTITYSHADOWS;
	}
	else
	{
		dl->light.flags &= ~DWL_FLAGS_CASTENTITYSHADOWS;
	}

	Vector shadowOffset( 0.0f, 0.0f, 0.0f );
	GetVectorForKey (e, "_shadoworiginoffset", shadowOffset );
	dl->light.shadow_cast_offset = shadowOffset;

	// get intensity
	if( g_bHDR && LightForKey( e, "_lightHDR", dl->light.intensity ) ) 
	{
	}
	else
	{
		LightForKey( e, "_light", dl->light.intensity );
	}
	
	// check angle, targets
	target = ValueForKey (e, "target");
	if (target[0])
	{	// point towards target
		e2 = FindTargetEntity (target);
		if (!e2)
			Warning("WARNING: light at (%i %i %i) has missing target\n",
					(int)dl->light.origin[0], (int)dl->light.origin[1], (int)dl->light.origin[2]);
		else
		{
			GetVectorForKey (e2, "origin", dest);
			VectorSubtract (dest, dl->light.origin, dl->light.normal);
			VectorNormalize (dl->light.normal);
		}
	}
	else
	{	
		// point down angle
		Vector angles;
		GetVectorForKey( e, "angles", angles );
		float pitch = FloatForKey (e, "pitch");
		float angle = FloatForKey (e, "angle");
		SetupLightNormalFromProps( QAngle( angles.x, angles.y, angles.z ), angle, pitch, dl->light.normal );
	}
	if ( g_bHDR )
		VectorScale( dl->light.intensity, 
					 FloatForKeyWithDefault( e, "_lightscaleHDR", 1.0 ),
					 dl->light.intensity );
}

static void SetLightFalloffParams( entity_t * e, directlight_t * dl )
{
	dl->m_flStartFadeDistance = 0;
	dl->m_flEndFadeDistance = - 1;
	dl->m_flCapDist = 1.0e22;
	if ( g_bFiniteFalloffModel )
	{
		float d0 = FloatForKey( e, "_zero_percent_distance" );
		dl->light.constant_attn = 1.0;
		dl->light.linear_attn = 0; //-2.0 * ( 1.0 / d0 );
		dl->light.quadratic_attn = -1.0 / ( d0 * d0 );
// 		for(float d=0;d<200;d+=20)
// 			printf("at %f, %f\n",d,1.0+d*d*dl->light.quadratic_attn );
	}
	else
	{
		float d50=FloatForKey( e, "_fifty_percent_distance" );
		if ( d50 )
		{
			float d0 = FloatForKey( e, "_zero_percent_distance" );
			if ( d0 < d50 )
			{
				Warning( "light has _fifty_percent_distance of %f but _zero_percent_distance of %f\n", d50, d0);
				d0 = 2.0 * d50;
			}
			float a = 0, b = 1, c = 0;
			if ( ! SolveInverseQuadraticMonotonic( 0, 1.0, d50, 2.0, d0, 256.0, a, b, c ))
			{
				Warning( "can't solve quadratic for light %f %f\n", d50, d0 );
			}
			// it it possible that the parameters couldn't be used because of enforing monoticity. If so, rescale so at
			// least the 50 percent value is right
//		printf("50 percent=%f 0 percent=%f\n",d50,d0);
// 		printf("a=%f b=%f c=%f\n",a,b,c);
			float v50 = c + d50 * ( b + d50 * a );
			float scale = 2.0 / v50;
			a *= scale;
			b *= scale;
			c *= scale;
// 		printf("scaled=%f a=%f b=%f c=%f\n",scale,a,b,c);
// 		for(float d=0;d<200;d+=20)
// 			printf("at %f, %f\n",d,1.0/(c+d*(b+d*a)));
			dl->light.quadratic_attn = a;
			dl->light.linear_attn = b;
			dl->light.constant_attn = c;


			if ( IntForKey(e, "_hardfalloff" ) )
			{
				dl->m_flEndFadeDistance = d0;
				dl->m_flStartFadeDistance = 0.75 * d0 + 0.25 * d50;		// start fading 3/4 way between 50 and 0. could allow adjust
			}
			else
			{
				// now, we will find the point at which the 1/x term reaches its maximum value, and
				// prevent the light from going past there. If a user specifes an extreme falloff, the
				// quadratic will start making the light brighter at some distance. We handle this by
				// fading it from the minimum brightess point down to zero at 10x the minimum distance
				if ( fabs( a ) > 0. )
				{
					float flMax = b / ( - 2.0 * a );				// where f' = 0
					if ( flMax > 0.0 )
					{
						dl->m_flCapDist = flMax;
						dl->m_flStartFadeDistance = flMax;
						dl->m_flEndFadeDistance = 10.0 * flMax;
					}
				}
			}
		}
		else
		{
			dl->light.constant_attn = FloatForKey (e, "_constant_attn" );
			dl->light.linear_attn = FloatForKey (e, "_linear_attn" );
			dl->light.quadratic_attn = FloatForKey (e, "_quadratic_attn" );

			dl->light.radius = FloatForKey (e, "_distance");

			// clamp values to >= 0
			if ( dl->light.constant_attn < EQUAL_EPSILON )
				dl->light.constant_attn = 0;

			if ( dl->light.linear_attn < EQUAL_EPSILON )
				dl->light.linear_attn = 0;

			if ( dl->light.quadratic_attn < EQUAL_EPSILON )
				dl->light.quadratic_attn = 0;

			if ( dl->light.constant_attn < EQUAL_EPSILON && dl->light.linear_attn < EQUAL_EPSILON && dl->light.quadratic_attn < EQUAL_EPSILON )
				dl->light.constant_attn = 1;

			// scale intensity for unit 100 distance
			float ratio = ( dl->light.constant_attn + 100 * dl->light.linear_attn + 100 * 100 * dl->light.quadratic_attn );
			if ( ratio > 0 )
			{
				VectorScale( dl->light.intensity, ratio, dl->light.intensity );
			}
		}
	}
}

static void ParseLightSpot( entity_t* e, directlight_t* dl )
{
	Vector dest;
	GetVectorForKey (e, "origin", dest );
	dl = AllocDLight( dest, true );

	ParseLightGeneric( e, dl );

	dl->light.type = emit_spotlight;

	dl->light.stopdot = FloatForKey (e, "_inner_cone");
	if (!dl->light.stopdot)
		dl->light.stopdot = 10;

	dl->light.stopdot2 = FloatForKey (e, "_cone");
	if (!dl->light.stopdot2) 
		dl->light.stopdot2 = dl->light.stopdot;
	if (dl->light.stopdot2 < dl->light.stopdot)
		dl->light.stopdot2 = dl->light.stopdot;

	// This is a point light if stop dots are 180...
	if ((dl->light.stopdot == 180) && (dl->light.stopdot2 == 180))
	{
		dl->light.stopdot = dl->light.stopdot2 = 0;
		dl->light.type = emit_point;
		dl->light.exponent = 0;
	}
	else
	{
		// Clamp to 90, that's all DX8 can handle! 
		if (dl->light.stopdot > 90)
		{
			Warning("WARNING: light_spot at (%i %i %i) has inner angle larger than 90 degrees! Clamping to 90...\n",
					(int)dl->light.origin[0], (int)dl->light.origin[1], (int)dl->light.origin[2]);
			dl->light.stopdot = 90;
		}

		if (dl->light.stopdot2 > 90)
		{
			Warning("WARNING: light_spot at (%i %i %i) has outer angle larger than 90 degrees! Clamping to 90...\n",
					(int)dl->light.origin[0], (int)dl->light.origin[1], (int)dl->light.origin[2]);
			dl->light.stopdot2 = 90;
		}

		dl->light.stopdot2 = (float)cos(dl->light.stopdot2/180*M_PI);
		dl->light.stopdot = (float)cos(dl->light.stopdot/180*M_PI);
		dl->light.exponent = FloatForKey (e, "_exponent");
	}

	SetLightFalloffParams(e,dl);
}

// NOTE: This is just a heuristic.  It traces a finite number of rays to find sky
// NOTE: Full vis is necessary to make this 100% correct.
bool CanLeafTraceToSky( int iLeaf )
{
	// UNDONE: Really want a point inside the leaf here.  Center is a guess, may not be in the leaf
	// UNDONE: Clip this to each plane bounding the leaf to guarantee
	Vector center = vec3_origin;
	for ( int i = 0; i < 3; i++ )
	{
		center[i] = ( (float)(dleafs[iLeaf].mins[i] + dleafs[iLeaf].maxs[i]) ) * 0.5f;
	}

	FourVectors center4, delta;
	fltx4 fractionVisible;
	for ( int j = 0; j < NUMVERTEXNORMALS; j+=4 )
	{
		// search back to see if we can hit a sky brush
		delta.LoadAndSwizzle( g_anorms[j], g_anorms[min( j+1, NUMVERTEXNORMALS-1 )],
			g_anorms[min( j+2, NUMVERTEXNORMALS-1 )], g_anorms[min( j+3, NUMVERTEXNORMALS-1 )] );
		delta *= -MAX_TRACE_LENGTH;
		delta += center4;

		// return true if any hits sky
		TestLine_DoesHitSky ( center4, delta, &fractionVisible );
		if ( TestSignSIMD ( CmpGtSIMD ( fractionVisible, Four_Zeros ) ) )
			return true;
	}

	return false;
}

void BuildVisForLightEnvironment( int nNumLights, directlight_t** pLights )
{
	// FIXME: The work in this function is executed redundantly for multiple emit_skylight lights.

	// Create the vis.
	for ( int iLeaf = 0; iLeaf < numleafs; ++iLeaf )
	{
		dleafs[iLeaf].flags &= ~( LEAF_FLAGS_SKY | LEAF_FLAGS_SKY2D );
		unsigned int iFirstFace = dleafs[iLeaf].firstleafface;
		for ( int iLeafFace = 0; iLeafFace < dleafs[iLeaf].numleaffaces; ++iLeafFace )
		{
			unsigned int iFace = dleaffaces[iFirstFace+iLeafFace];
			
			texinfo_t &tex = texinfo[g_pFaces[iFace].texinfo];
			if ( tex.flags & SURF_SKY )
			{
				if ( tex.flags & SURF_SKY2D )
				{
					dleafs[iLeaf].flags |= LEAF_FLAGS_SKY2D;
				}
				else
				{
					dleafs[iLeaf].flags |= LEAF_FLAGS_SKY;
				}

				for ( int iLight = 0; iLight < nNumLights; ++iLight )
				{
					MergeDLightVis( pLights[iLight], dleafs[iLeaf].cluster );
				}
				break;
			}
		}
	}

	// Second pass to set flags on leaves that don't contain sky, but touch leaves that
	// contain sky.
	byte pvs[MAX_MAP_CLUSTERS / 8];

	int nLeafBytes = (numleafs >> 3) + 1;
	unsigned char *pLeafBits = (unsigned char *)stackalloc( nLeafBytes * sizeof(unsigned char) );
	unsigned char *pLeaf2DBits = (unsigned char *)stackalloc( nLeafBytes * sizeof(unsigned char) );
	memset( pLeafBits, 0, nLeafBytes );
	memset( pLeaf2DBits, 0, nLeafBytes );

	for ( int iLeaf = 0; iLeaf < numleafs; ++iLeaf )
	{
		// If this leaf has light (3d skybox) in it, then don't bother
		if ( dleafs[iLeaf].flags & LEAF_FLAGS_SKY )
			continue;

		// Don't bother with this leaf if it's solid
		if ( dleafs[iLeaf].contents & CONTENTS_SOLID )
			continue;

		// See what other leaves are visible from this leaf
		GetVisCache( -1, dleafs[iLeaf].cluster, pvs );

		// Now check out all other leaves
		int nByte = iLeaf >> 3;
		int nBit = 1 << ( iLeaf & 0x7 );
		for ( int iLeaf2 = 0; iLeaf2 < numleafs; ++iLeaf2 )
		{
			if ( iLeaf2 == iLeaf )
				continue;

			if ( !(dleafs[iLeaf2].flags & ( LEAF_FLAGS_SKY | LEAF_FLAGS_SKY2D ) ) )
				continue;

			// Can this leaf see into the leaf with the sky in it?
			if ( !PVSCheck( pvs, dleafs[iLeaf2].cluster ) )
				continue;

			if ( dleafs[iLeaf2].flags & LEAF_FLAGS_SKY2D )
			{
				pLeaf2DBits[ nByte ] |= nBit;
			}
			if ( dleafs[iLeaf2].flags & LEAF_FLAGS_SKY )
			{
				pLeafBits[ nByte ] |= nBit;

				// As soon as we know this leaf needs to draw the 3d skybox, we're done
				break;
			}
		}
	}

	// Must set the bits in a separate pass so as to not flood-fill LEAF_FLAGS_SKY everywhere
	// pLeafbits is a bit array of all leaves that need to be marked as seeing sky
	for ( int iLeaf = 0; iLeaf < numleafs; ++iLeaf )
	{
		// If this leaf has light (3d skybox) in it, then don't bother
		if ( dleafs[iLeaf].flags & LEAF_FLAGS_SKY )
			continue;

		// Don't bother with this leaf if it's solid
		if ( dleafs[iLeaf].contents & CONTENTS_SOLID )
			continue;

		// Check to see if this is a 2D skybox leaf
		if ( pLeaf2DBits[ iLeaf >> 3 ] & (1 << ( iLeaf & 0x7 )) )
		{
			dleafs[iLeaf].flags |= LEAF_FLAGS_SKY2D;
		}

		// If this is a 3D skybox leaf, then we don't care if it was previously a 2D skybox leaf
		if ( pLeafBits[ iLeaf >> 3 ] & (1 << ( iLeaf & 0x7 )) )
		{
			dleafs[iLeaf].flags |= LEAF_FLAGS_SKY;
			dleafs[iLeaf].flags &= ~LEAF_FLAGS_SKY2D;
		}
		else
		{
			// if radial vis was used on this leaf some of the portals leading
			// to sky may have been culled.  Try tracing to find sky.
			if ( dleafs[iLeaf].flags & LEAF_FLAGS_RADIAL )
			{
				if ( CanLeafTraceToSky(iLeaf) )
				{
					// FIXME: Should make a version that checks if we hit 2D skyboxes.. oh well.
					dleafs[iLeaf].flags |= LEAF_FLAGS_SKY;
				}
			}
		}
	}
}

static char *ValueForKeyWithDefault (entity_t *ent, char *key, char *default_value = NULL)
{
	epair_t	*ep;
	
	for (ep=ent->epairs ; ep ; ep=ep->next)
		if (!strcmp (ep->key, key) )
			return ep->value;
	return default_value;
}

static void ParseLightEnvironment( entity_t* e, directlight_t* dl )
{
	Vector dest;
	GetVectorForKey (e, "origin", dest );
	dl = AllocDLight( dest, false );

	ParseLightGeneric( e, dl );

	if ( !gSkyLight )
	{
		char *angle_str=ValueForKeyWithDefault( e, "SunSpreadAngle" );
		if (angle_str)
		{
			g_SunAngularExtent=atof(angle_str);
			g_SunAngularExtent=sin((M_PI/180.0)*g_SunAngularExtent);
			dl->m_flSkyLightSunAngularExtent = g_SunAngularExtent;
			printf("sun extent from map=%f\n",g_SunAngularExtent);
		}

		// Sky light.
		gSkyLight = dl;
		dl->light.type = emit_skylight;

		// Sky ambient light.
		gAmbient = AllocDLight( dl->light.origin, false );
		gAmbient->light.type = emit_skyambient;
		if( g_bHDR && LightForKey( e, "_ambientHDR", gAmbient->light.intensity ) )
		{
			// we have a valid HDR ambient light value
		}
		else if ( !LightForKey( e, "_ambient", gAmbient->light.intensity ) )
		{
			VectorScale( dl->light.intensity, 0.5, gAmbient->light.intensity );
		}
		if ( g_bHDR )
		{
			VectorScale( gAmbient->light.intensity, 
						 FloatForKeyWithDefault( e, "_AmbientScaleHDR", 1.0 ), 
						 gAmbient->light.intensity );
		}
		
		// skylight and ambient light never cast entity shadows
		gSkyLight->light.flags &= ~DWL_FLAGS_CASTENTITYSHADOWS;
		gAmbient->light.flags &= ~DWL_FLAGS_CASTENTITYSHADOWS;

		directlight_t* lights[] = { gSkyLight, gAmbient };
		BuildVisForLightEnvironment( 2, lights );
 
		// Add sky and sky ambient lights to the list.
		AddDLightToActiveList( gSkyLight );
		AddDLightToActiveList( gAmbient );
	}
}

static void ParseLightDirectional( entity_t* e, directlight_t* dl )
{
	Vector dest;
	GetVectorForKey (e, "origin", dest );
	dl = AllocDLight( dest, true );

	ParseLightGeneric( e, dl );

	char *angle_str=ValueForKeyWithDefault( e, "SunSpreadAngle" );
	if (angle_str)
	{
		dl->m_flSkyLightSunAngularExtent = atof(angle_str);
		dl->m_flSkyLightSunAngularExtent = sin((M_PI/180.0)*dl->m_flSkyLightSunAngularExtent);
	}

	dl->light.type = emit_skylight;
	// For the engine, emit_skylight is the type we want.
	// Set an additional flag identifying this as "not the global skylight" for vrad. This will cause it to use the angular extent associated with this light
	// instead of the global one.
	dl->m_bSkyLightIsDirectionalLight = true;

	// directional lights never cast entity shadows
	dl->light.flags &= ~DWL_FLAGS_CASTENTITYSHADOWS;

	BuildVisForLightEnvironment( 1, &dl );
}

static void ParseLightPoint( entity_t* e, directlight_t* dl )
{
	Vector dest;
	GetVectorForKey (e, "origin", dest );
	dl = AllocDLight( dest, true );

	ParseLightGeneric( e, dl );

	dl->light.type = emit_point;

	SetLightFalloffParams(e,dl);
}

/*
  =============
  CreateDirectLights
  =============
*/
#define DIRECT_SCALE (100.0*100.0)
void CreateDirectLights (void)
{
	unsigned        i;
	CPatch	        *p = NULL;
	directlight_t	*dl = NULL;
	entity_t	    *e = NULL;
	char	        *name;
	Vector	        dest;

	numdlights = 0;

	FreeDLights();

	//
	// surfaces
	//
	unsigned int uiPatchCount = g_Patches.Count();
	for (i=0; i< uiPatchCount; i++)
	{
		p = &g_Patches.Element( i );

		// skip parent patches
		if (p->child1 != g_Patches.InvalidIndex() )
			continue;

		if (p->basearea < 1e-6)
			continue;

		if( VectorAvg( p->baselight ) >= dlight_threshold )
		{
			dl = AllocDLight( p->origin, true );

			dl->light.type = emit_surface;
			dl->light.flags &= ~DWL_FLAGS_CASTENTITYSHADOWS;

			VectorCopy (p->normal, dl->light.normal);
			Assert( VectorLength( p->normal ) > 1.0e-20 );
			// scale intensity by number of texture instances
			VectorScale( p->baselight, lightscale * p->area * p->scale[0] * p->scale[1] / p->basearea, dl->light.intensity );

			// scale to a range that results in actual light
			VectorScale( dl->light.intensity, DIRECT_SCALE, dl->light.intensity );
		}
	}
	
	//
	// entities
	//
	for (i=0 ; i<(unsigned)num_entities ; i++)
	{
		e = &entities[i];
		name = ValueForKey (e, "classname");
		if (strncmp (name, "light", 5))
			continue;

		// Light_dynamic is actually a real entity; not to be included here...
		if (!strcmp (name, "light_dynamic"))
			continue;

		if (!strcmp (name, "light_spot"))
		{
			ParseLightSpot( e, dl );
		}
		else if (!strcmp(name, "light_environment")) 
		{
			ParseLightEnvironment( e, dl );
		}
		else if (!strcmp(name, "light_directional")) 
		{
			ParseLightDirectional( e, dl );
		}
		else if (!strcmp(name, "light")) 
		{
			ParseLightPoint( e, dl );
		}
		else
		{
			qprintf( "unsupported light entity: \"%s\"\n", name );
		}
	}

	qprintf ("%i direct lights\n", numdlights);
	// exit(1);
}

/*
  =============
  ExportDirectLightsToWorldLights
  =============
*/

void ExportDirectLightsToWorldLights()
{
	directlight_t		*dl;

	// In case the level has already been VRADed.
	*pNumworldlights = 0;

	for (dl = activelights; dl != NULL; dl = dl->next )
	{
		dworldlight_t *wl = &dworldlights[(*pNumworldlights)++];

		if (*pNumworldlights > MAX_MAP_WORLDLIGHTS)
		{
			Error("too many lights %d / %d\n", *pNumworldlights, MAX_MAP_WORLDLIGHTS );
		}

		wl->cluster	= dl->light.cluster;
		wl->type	= dl->light.type;
		wl->style	= dl->light.style;
		VectorCopy( dl->light.origin, wl->origin );
		// FIXME: why does vrad want 0 to 255 and not 0 to 1??
		VectorScale( dl->light.intensity, (1.0 / 255.0), wl->intensity );
		VectorCopy( dl->light.normal, wl->normal );
		VectorCopy( dl->light.shadow_cast_offset, wl->shadow_cast_offset );
		wl->stopdot	= dl->light.stopdot;
		wl->stopdot2 = dl->light.stopdot2;
		wl->exponent = dl->light.exponent;
		wl->radius = dl->light.radius;
		wl->constant_attn = dl->light.constant_attn;
		wl->linear_attn = dl->light.linear_attn;
		wl->quadratic_attn = dl->light.quadratic_attn;
		wl->flags = dl->light.flags;
	}
}

/*
  =============
  GatherSampleLight
  =============
*/
#define NORMALFORMFACTOR	40.156979 // accumuated dot products for hemisphere

#define CONSTANT_DOT (.7/2)

#define NSAMPLES_SUN_AREA_LIGHT 300							// number of samples to take for an
                                                            // non-point sun light

// Helper function - gathers light from sun (emit_skylight)
void GatherSampleSkyLightSSE( SSE_sampleLightOutput_t &out, directlight_t *dl, int facenum, 
							 FourVectors const& pos, FourVectors *pNormals, int normalCount, int iThread,
							 int nLFlags, int static_prop_index_to_ignore,
							 float flEpsilon )
{
	bool bIgnoreNormals = ( nLFlags & GATHERLFLAGS_IGNORE_NORMALS ) != 0;
	bool force_fast = ( nLFlags & GATHERLFLAGS_FORCE_FAST ) != 0;
	fltx4 dot;

	float fSunAngularExtent = g_SunAngularExtent;
	if ( dl->m_bSkyLightIsDirectionalLight )
	{
		fSunAngularExtent = dl->m_flSkyLightSunAngularExtent;
	}

	if ( bIgnoreNormals )
		dot = ReplicateX4( CONSTANT_DOT );
	else
		dot = NegSIMD( pNormals[0] * dl->light.normal );

	dot = MaxSIMD( dot, Four_Zeros );

	dot = SoftenCosineTerm( dot );

	int zeroMask = TestSignSIMD ( CmpEqSIMD( dot, Four_Zeros ) );
	if (zeroMask == 0xF)
		return;

	int nsamples = 1;
	if ( fSunAngularExtent > 0.0f )
	{
		nsamples = NSAMPLES_SUN_AREA_LIGHT;
		if ( do_fast || force_fast )
			nsamples /= 4;
	}

	fltx4 totalFractionVisible = Four_Zeros;
	fltx4 fractionVisible = Four_Zeros;

	DirectionalSampler_t sampler;

	for ( int d = 0; d < nsamples; d++ )
	{
		// determine visibility of skylight
		// serach back to see if we can hit a sky brush
		Vector delta;
		VectorScale( dl->light.normal, -MAX_TRACE_LENGTH, delta );
		if ( d )
		{
			// jitter light source location
			Vector ofs = sampler.NextValue();
			ofs *= MAX_TRACE_LENGTH * fSunAngularExtent;
			delta += ofs;
		}
		FourVectors delta4;
		delta4.DuplicateVector ( delta );
		delta4 += pos;

		TestLine_DoesHitSky ( pos, delta4, &fractionVisible, true, static_prop_index_to_ignore );

		totalFractionVisible = AddSIMD ( totalFractionVisible, fractionVisible );
	}

	fltx4 seeAmount = MulSIMD ( totalFractionVisible, ReplicateX4 ( 1.0f / nsamples ) );

	out.m_flDot[0] = MulSIMD ( dot, seeAmount );
	out.m_flFalloff = Four_Ones;
	out.m_flSunAmount[0] = MulSIMD( out.m_flDot[0], out.m_flFalloff );

	for ( int i = 1; i < normalCount; i++ )
	{
		if ( bIgnoreNormals )
		{
			out.m_flDot[i] = ReplicateX4( CONSTANT_DOT );
			out.m_flSunAmount[i] = Four_Zeros;
		}
		else
		{
			out.m_flDot[i] = NegSIMD( pNormals[i] * dl->light.normal );
			out.m_flDot[i] = MaxSIMD( out.m_flDot[i], Four_Zeros );
			out.m_flDot[i] = SoftenCosineTerm( out.m_flDot[i] );
			out.m_flDot[i] = MulSIMD( out.m_flDot[i], seeAmount );
			out.m_flSunAmount[i] = MulSIMD( out.m_flDot[i], out.m_flFalloff );
		}
	}
}

// Helper function - gathers light from ambient sky light
void GatherSampleAmbientSkySSE( SSE_sampleLightOutput_t &out, directlight_t *dl, int facenum, 
							   FourVectors const& pos, FourVectors *pNormals, int normalCount, int iThread,
							   int nLFlags, int static_prop_index_to_ignore,
							   float flEpsilon )
{

	bool bIgnoreNormals = ( nLFlags & GATHERLFLAGS_IGNORE_NORMALS ) != 0;
	bool force_fast = ( nLFlags & GATHERLFLAGS_FORCE_FAST ) != 0;

	fltx4 sumdot = Four_Zeros;
	fltx4 ambient_intensity[NUM_BUMP_VECTS+1];
	fltx4 possibleHitCount[NUM_BUMP_VECTS+1];
	fltx4 dots[NUM_BUMP_VECTS+1];

	for ( int i = 0; i < normalCount; i++ )
	{
		ambient_intensity[i] = Four_Zeros;
		possibleHitCount[i] = Four_Zeros;
	}

	DirectionalSampler_t sampler;
	int nsky_samples = NUMVERTEXNORMALS;
	if (do_fast || force_fast )
		nsky_samples /= 4;
	else
		nsky_samples *= g_flSkySampleScale;

	for (int j = 0; j < nsky_samples; j++)
	{
		FourVectors anorm;
		anorm.DuplicateVector( sampler.NextValue() );

		if ( bIgnoreNormals )
			dots[0] = ReplicateX4( CONSTANT_DOT );
		else
			dots[0] = NegSIMD( pNormals[0] * anorm );

		dots[0] = SoftenCosineTerm( dots[0] );

		fltx4 validity = CmpGtSIMD( dots[0], ReplicateX4( EQUAL_EPSILON ) );

		// No possibility of anybody getting lit
		if ( !TestSignSIMD( validity ) )
			continue;

		dots[0] = AndSIMD( validity, dots[0] );
		sumdot = AddSIMD( dots[0], sumdot );
		possibleHitCount[0] = AddSIMD( AndSIMD( validity, Four_Ones ), possibleHitCount[0] );

		for ( int i = 1; i < normalCount; i++ )
		{
			if ( bIgnoreNormals )
				dots[i] = ReplicateX4( CONSTANT_DOT );
			else
				dots[i] = NegSIMD( pNormals[i] * anorm );

			dots[i] = SoftenCosineTerm( dots[i] );

			fltx4 validity2 = CmpGtSIMD( dots[i], ReplicateX4 ( EQUAL_EPSILON ) );
			dots[i] = AndSIMD( validity2, dots[i] );
			possibleHitCount[i] = AddSIMD( AndSIMD( AndSIMD( validity, validity2 ), Four_Ones ), possibleHitCount[i] );
		}

		// search back to see if we can hit a sky brush
		FourVectors delta = anorm;
		delta *= -MAX_TRACE_LENGTH;
		delta += pos;
		FourVectors surfacePos = pos;
		FourVectors offset = anorm;
		offset *= -flEpsilon;
		surfacePos -= offset;

		fltx4 fractionVisible = Four_Ones;
		TestLine_DoesHitSky( surfacePos, delta, &fractionVisible, true, static_prop_index_to_ignore );
		for ( int i = 0; i < normalCount; i++ )
		{
			fltx4 addedAmount = MulSIMD( fractionVisible, dots[i] );
			ambient_intensity[i] = AddSIMD( ambient_intensity[i], addedAmount );
		}

	}

	out.m_flFalloff = Four_Ones;
	for ( int i = 0; i < normalCount; i++ )
	{
		// now scale out the missing parts of the hemisphere of this bump basis vector
		fltx4 factor = ReciprocalSIMD( possibleHitCount[0] );
		factor = MulSIMD( factor, possibleHitCount[i] );
		out.m_flDot[i] = MulSIMD( factor, sumdot );
		out.m_flDot[i] = ReciprocalSIMD( out.m_flDot[i] );
		out.m_flDot[i] = MulSIMD( ambient_intensity[i], out.m_flDot[i] );
		out.m_flSunAmount[i] = Four_Zeros;
	}
}

// Helper function - gathers light from area lights, spot lights, and point lights
void GatherSampleStandardLightSSE( SSE_sampleLightOutput_t &out, directlight_t *dl, int facenum, 
								  FourVectors const& pos, FourVectors *pNormals, int normalCount, int iThread,
								  int nLFlags, int static_prop_index_to_ignore,
								  float flEpsilon )
{
	bool bIgnoreNormals = ( nLFlags & GATHERLFLAGS_IGNORE_NORMALS ) != 0;

	FourVectors src;
	src.DuplicateVector( vec3_origin );

	if (dl->facenum == -1)
	{
		src.DuplicateVector( dl->light.origin );
	}

	// Find light vector
	FourVectors delta;
	delta = src;
	delta -= pos;
	fltx4 dist2 = delta.length2();
	fltx4 rpcDist = ReciprocalSqrtSIMD( dist2 );
	delta *= rpcDist;
	fltx4 dist = SqrtEstSIMD( dist2 );//delta.VectorNormalize();

	// Compute dot
	fltx4 dot = ReplicateX4( (float) CONSTANT_DOT );
	if ( !bIgnoreNormals )
		dot = delta * pNormals[0];
	dot = MaxSIMD( Four_Zeros, dot );

	dot = SoftenCosineTerm( dot );

	// Affix dot to zero if past fade distz
	bool bHasHardFalloff = ( dl->m_flEndFadeDistance > dl->m_flStartFadeDistance );
	if ( bHasHardFalloff )
	{
		fltx4 notPastFadeDist = CmpLeSIMD ( dist, ReplicateX4 ( dl->m_flEndFadeDistance ) );
		dot = AndSIMD( dot, notPastFadeDist );  // dot = 0 if past fade distance
		if ( !TestSignSIMD ( notPastFadeDist ) )
			return;
	}

	dist = MaxSIMD( dist, Four_Ones );
	fltx4 falloffEvalDist = MinSIMD( dist, ReplicateX4( dl->m_flCapDist ) );

	fltx4 constant, linear, quadratic;
	fltx4 dot2, inCone, inFringe, mult;
	FourVectors offset;

	switch (dl->light.type)
	{
	case emit_point:
		constant  = ReplicateX4( dl->light.constant_attn );
		linear    = ReplicateX4( dl->light.linear_attn );
		quadratic = ReplicateX4( dl->light.quadratic_attn );

		out.m_flFalloff = MulSIMD( falloffEvalDist, falloffEvalDist );
		out.m_flFalloff = MulSIMD( out.m_flFalloff, quadratic );
		out.m_flFalloff = AddSIMD( out.m_flFalloff, MulSIMD( linear, falloffEvalDist ) );
		out.m_flFalloff = AddSIMD( out.m_flFalloff, constant );
		if ( g_bFiniteFalloffModel )
		{
			out.m_flFalloff = MaxSIMD( Four_Zeros, out.m_flFalloff );
		}
		else
		{
			out.m_flFalloff = ReciprocalSIMD( out.m_flFalloff );
		}
		break;

	case emit_surface:
		dot2 = delta * dl->light.normal;
		dot2 = NegSIMD( dot2 );

		// Light behind surface yields zero dot
		dot2 = MaxSIMD( Four_Zeros, dot2 );
		if ( TestSignSIMD( CmpEqSIMD( Four_Zeros, dot ) ) == 0xF )
			return;

		out.m_flFalloff = ReciprocalSIMD ( dist2 );
		out.m_flFalloff = MulSIMD( out.m_flFalloff, dot2 );

		// move the endpoint away from the surface by epsilon to prevent hitting the surface with the trace
		offset.DuplicateVector ( dl->light.normal );
		offset *= DIST_EPSILON;
		src += offset;
		break;

	case emit_spotlight:
		dot2 = delta * dl->light.normal;
		dot2 = NegSIMD( dot2 );

		// Affix dot2 to zero if outside light cone
		inCone = CmpGtSIMD( dot2, ReplicateX4( dl->light.stopdot2 ) );
		if ( !TestSignSIMD ( inCone ) )
			return;
		dot = AndSIMD( inCone, dot );

		constant  = ReplicateX4( dl->light.constant_attn );
		linear    = ReplicateX4( dl->light.linear_attn );
		quadratic = ReplicateX4( dl->light.quadratic_attn );

		out.m_flFalloff = MulSIMD( falloffEvalDist, falloffEvalDist );
		out.m_flFalloff = MulSIMD( out.m_flFalloff, quadratic );
		out.m_flFalloff = AddSIMD( out.m_flFalloff, MulSIMD( linear, falloffEvalDist ) );
		out.m_flFalloff = AddSIMD( out.m_flFalloff, constant );
		if ( g_bFiniteFalloffModel )
		{
			out.m_flFalloff = MaxSIMD( Four_Zeros, out.m_flFalloff );
		}
		else
		{
			out.m_flFalloff = ReciprocalSIMD( out.m_flFalloff );
		}
		out.m_flFalloff = MulSIMD( out.m_flFalloff, dot2 );

		// outside the inner cone
		inFringe = CmpLeSIMD( dot2, ReplicateX4( dl->light.stopdot ) );
		mult = ReplicateX4( dl->light.stopdot - dl->light.stopdot2 );
		mult = ReciprocalSIMD( mult );
		mult = MulSIMD( mult, SubSIMD( dot2, ReplicateX4( dl->light.stopdot2 ) ) );
		mult = MinSIMD( mult, Four_Ones );
		mult = MaxSIMD( mult, Four_Zeros );

		// pow is fixed point, so this isn't the most accurate, but it doesn't need to be
		if ( (dl->light.exponent != 0.0f ) && ( dl->light.exponent != 1.0f ) )
			mult = PowSIMD( mult, dl->light.exponent );

		// if not in between inner and outer cones, mult by 1
		mult = AndSIMD( inFringe, mult );
		mult = AddSIMD( mult, AndNotSIMD( inFringe, Four_Ones ) );
		out.m_flFalloff = MulSIMD( mult, out.m_flFalloff );
		break;

	}

	// we may be in the fade region - modulate lighting by the fade curve
	//float t = ( dist - dl->m_flStartFadeDistance ) / 
	//	( dl->m_flEndFadeDistance - dl->m_flStartFadeDistance );
	if ( bHasHardFalloff )
	{
		fltx4 t = ReplicateX4( dl->m_flEndFadeDistance - dl->m_flStartFadeDistance );
		t = ReciprocalSIMD( t );
		t = MulSIMD( t, SubSIMD( dist, ReplicateX4( dl->m_flStartFadeDistance ) ) );

		// clamp t to [0...1]
		t = MinSIMD( t, Four_Ones );
		t = MaxSIMD( t, Four_Zeros );
		t = SubSIMD( Four_Ones, t );

		// Using QuinticInterpolatingPolynomial, SSE-ified
		// t * t * t *( t * ( t* 6.0 - 15.0 ) + 10.0 )
		mult = SubSIMD( MulSIMD( ReplicateX4( 6.0f ), t ), ReplicateX4( 15.0f ) );
		mult = AddSIMD( MulSIMD( mult, t ), ReplicateX4( 10.0f ) );
		mult = MulSIMD( MulSIMD( t, t), mult );
		mult = MulSIMD( t, mult );
		out.m_flFalloff = MulSIMD( mult, out.m_flFalloff );
	}

	if ( !( nLFlags & GATHERLFLAGS_NO_OCCLUSION ) )
	{
		// Raytrace for visibility function
		fltx4 fractionVisible = Four_Ones;
		TestLine( pos, src, &fractionVisible, static_prop_index_to_ignore);
		dot = MulSIMD( fractionVisible, dot );
	}

	out.m_flDot[0] = dot;
	for ( int i = 1; i < normalCount; i++ )
	{
		if ( bIgnoreNormals )
		{
			out.m_flDot[i] = ReplicateX4( (float)CONSTANT_DOT );
			out.m_flSunAmount[i] = Four_Zeros; 
		}
		else
		{
			out.m_flDot[i] = pNormals[i] * delta;
			out.m_flDot[i] = MaxSIMD( Four_Zeros, out.m_flDot[i] );
			out.m_flSunAmount[i] = Four_Zeros; 
		}
	}
}

// returns dot product with normal and delta
// dl - light
// pos - position of sample
// normal - surface normal of sample
// out.m_flDot[] - returned dot products with light vector and each normal
// out.m_flFalloff - amount of light falloff
void GatherSampleLightSSE( SSE_sampleLightOutput_t &out, directlight_t *dl, int facenum, 
					   FourVectors const& pos, FourVectors *pNormals, int normalCount, int iThread,
					   int nLFlags,
					   int static_prop_index_to_ignore,
					   float flEpsilon )
{
	for ( int b = 0; b < normalCount; b++ )
	{
		out.m_flDot[b] = Four_Zeros;
		out.m_flSunAmount[b] = Four_Zeros;
	}
	out.m_flFalloff = Four_Zeros;
	Assert( normalCount <= (NUM_BUMP_VECTS+1) );

	// skylights work fundamentally differently than normal lights
	switch( dl->light.type )
	{
	case emit_skylight:
		GatherSampleSkyLightSSE( out, dl, facenum, pos, pNormals, normalCount,
		                         iThread, nLFlags, static_prop_index_to_ignore, flEpsilon );
		break;
	case emit_skyambient:
		GatherSampleAmbientSkySSE( out, dl, facenum, pos, pNormals, normalCount,
		                           iThread, nLFlags, static_prop_index_to_ignore, flEpsilon );
		break;
	case emit_point:
	case emit_surface:
	case emit_spotlight:
		GatherSampleStandardLightSSE( out, dl, facenum, pos, pNormals, normalCount,
		                              iThread, nLFlags, static_prop_index_to_ignore, flEpsilon );
		break;
	default:
		Error ("Bad dl->light.type");
		return;
	}

	// Ambient occlusion for the 4 sample positions & normals
	fltx4 ao = Four_Ones;
	bool bIgnoreNormals = (nLFlags & GATHERLFLAGS_IGNORE_NORMALS) != 0;
	if ( !bIgnoreNormals ) // Don't calculate ambient occlusion for objects that ignore normals for gathering light
	{
		if ( nLFlags & GATHERLFLAGS_STATICPROP )
		{
			// for static props we want the sun amount for the basis normals to be mutliplied by the ao of the main vertex normal only.
			// lightmaps using this path do not send basis normals here, we do so for static props to take advantage of the SIMD optimisation this path provides.
			ao = CalculateAmbientOcclusion4( pos, *pNormals, static_prop_index_to_ignore );
			fltx4 ao0 = SplatXSIMD( ao );
			ao = ao0;
		}
		else
		{
			ao = CalculateAmbientOcclusion4( pos, *pNormals, static_prop_index_to_ignore );
		}
	}

	// NOTE: Notice here that if the light is on the back side of the face
	// (tested by checking the dot product of the face normal and the light position)
	// we don't want it to contribute to *any* of the bumped lightmaps. It glows
	// in disturbing ways if we don't do this.
	out.m_flDot[0] = MaxSIMD ( out.m_flDot[0], Four_Zeros );
	fltx4 notZero = CmpGtSIMD( out.m_flDot[0], Four_Zeros );
	out.m_flDot[0] = MulSIMD( out.m_flDot[0], ao );
	out.m_flSunAmount[0] = MulSIMD( out.m_flSunAmount[0], ao );
	for ( int n = 1; n < normalCount; n++ )
	{
		out.m_flDot[n] = MaxSIMD( out.m_flDot[n], Four_Zeros );
		out.m_flDot[n] = AndSIMD( out.m_flDot[n], notZero );
		out.m_flDot[n] = MulSIMD( out.m_flDot[n], ao );
		out.m_flSunAmount[n] = MulSIMD( out.m_flSunAmount[n], ao );
	}
}

/*
  =============
  AddSampleToPatch

  Take the sample's collected light and
  add it back into the apropriate patch
  for the radiosity pass.
  =============
*/
void AddSampleToPatch (sample_t *s, LightingValue_t& light, int facenum)
{
	CPatch	*patch;
	Vector	mins, maxs;
	int		i;

	if (numbounce == 0)
		return;
	if( VectorAvg( light.m_vecLighting ) < 1)
		return;

	//
	// fixed the sample position and normal -- need to find the equiv pos, etc to set up 
	// patches
	//
	if( g_FacePatches.Element( facenum ) == g_FacePatches.InvalidIndex() )
		return;

	float radius = sqrt( s->area ) / 2.0;

	CPatch *pNextPatch = NULL;
	for( patch = &g_Patches.Element( g_FacePatches.Element( facenum ) ); patch; patch = pNextPatch )
	{
		// next patch
		pNextPatch = NULL;
		if( patch->ndxNext != g_Patches.InvalidIndex() )
		{
			pNextPatch = &g_Patches.Element( patch->ndxNext );
		}

		if (patch->sky)
			continue;

		// skip patches with children
		if ( patch->child1 != g_Patches.InvalidIndex() )
		 	continue;

		// see if the point is in this patch (roughly)
		WindingBounds (patch->winding, mins, maxs);

		for (i=0 ; i<3 ; i++)
		{
			if (mins[i] > s->pos[i] + radius)
				goto nextpatch;
			if (maxs[i] < s->pos[i] - radius)
				goto nextpatch;
		}

		// add the sample to the patch
		patch->samplearea += s->area;
		VectorMA( patch->samplelight, s->area, light.m_vecLighting, patch->samplelight );

 nextpatch:;
	}
	// don't worry if some samples don't find a patch
}


void GetPhongNormal( int facenum, Vector const& spot, Vector& phongnormal )
{
	int	j;
	dface_t		*f = &g_pFaces[facenum];
//	dplane_t	*p = &dplanes[f->planenum];
	Vector		facenormal, vspot;

	VectorCopy( dplanes[f->planenum].normal, facenormal );
	VectorCopy( facenormal, phongnormal );

	if ( smoothing_threshold != 1 )
	{
		faceneighbor_t *fn = &faceneighbor[facenum];

		// Calculate modified point normal for surface
		// Use the edge normals iff they are defined.  Bend the surface towards the edge normal(s)
		// Crude first attempt: find nearest edge normal and do a simple interpolation with facenormal.
		// Second attempt: find edge points+center that bound the point and do a three-point triangulation(baricentric)
		// Better third attempt: generate the point normals for all vertices and do baricentric triangulation.

		for (j=0 ; j<f->numedges ; j++)
		{
			Vector	v1, v2;
			//int e = dsurfedges[f->firstedge + j];
			//int e1 = dsurfedges[f->firstedge + ((j+f->numedges-1)%f->numedges)];
			//int e2 = dsurfedges[f->firstedge + ((j+1)%f->numedges)];

			//edgeshare_t	*es = &edgeshare[abs(e)];
			//edgeshare_t	*es1 = &edgeshare[abs(e1)];
			//edgeshare_t	*es2 = &edgeshare[abs(e2)];
			// dface_t	*f2;
			float		a1, a2, aa, bb, ab;
			int			vert1, vert2;

			Vector& n1 = fn->normal[j];
			Vector& n2 = fn->normal[(j+1)%f->numedges];

			/*
			  if (VectorCompare( n1, fn->facenormal ) 
			  && VectorCompare( n2, fn->facenormal) )
			  continue;
			*/

			vert1 = EdgeVertex( f, j );
			vert2 = EdgeVertex( f, j+1 );

			Vector& p1 = dvertexes[vert1].point;
			Vector& p2 = dvertexes[vert2].point;

			// Build vectors from the middle of the face to the edge vertexes and the sample pos.
			VectorSubtract( p1, face_centroids[facenum], v1 );
			VectorSubtract( p2, face_centroids[facenum], v2 );
			VectorSubtract( spot, face_centroids[facenum], vspot );
			aa = DotProduct( v1, v1 );
			bb = DotProduct( v2, v2 );
			ab = DotProduct( v1, v2 );
			a1 = (bb * DotProduct( v1, vspot ) - ab * DotProduct( vspot, v2 )) / (aa * bb - ab * ab);
			a2 = (DotProduct( vspot, v2 ) - a1 * ab) / bb;

			// Test center to sample vector for inclusion between center to vertex vectors (Use dot product of vectors)
			if ( a1 >= 0.0 && a2 >= 0.0)
			{
				// calculate distance from edge to pos
				Vector	temp;
				float scale;
				
				// Interpolate between the center and edge normals based on sample position
				scale = 1.0 - a1 - a2;
				VectorScale( fn->facenormal, scale, phongnormal );
				VectorScale( n1, a1, temp );
				VectorAdd( phongnormal, temp, phongnormal );
				VectorScale( n2, a2, temp );
				VectorAdd( phongnormal, temp, phongnormal );
				Assert( VectorLength( phongnormal ) > 1.0e-20 );
				VectorNormalize( phongnormal );

				/*
				  if (a1 > 1 || a2 > 1 || a1 + a2 > 1)
				  {
				  Msg("\n%.2f %.2f\n", a1, a2 );
				  Msg("%.2f %.2f %.2f\n", v1[0], v1[1], v1[2] );
				  Msg("%.2f %.2f %.2f\n", v2[0], v2[1], v2[2] );
				  Msg("%.2f %.2f %.2f\n", vspot[0], vspot[1], vspot[2] );
				  exit(1);

				  a1 = 0;
				  }
				*/
				/*
				  phongnormal[0] = (((j + 1) & 4) != 0) * 255;
				  phongnormal[1] = (((j + 1) & 2) != 0) * 255;
				  phongnormal[2] = (((j + 1) & 1) != 0) * 255;
				*/
				return;
			}
		}
	}
}

void GetPhongNormal( int facenum, FourVectors const& spot, FourVectors& phongnormal )
{
	int	j;
	dface_t		*f = &g_pFaces[facenum];
	//	dplane_t	*p = &dplanes[f->planenum];
	Vector		facenormal;
	FourVectors vspot;

	VectorCopy( dplanes[f->planenum].normal, facenormal );
	phongnormal.DuplicateVector( facenormal );

	FourVectors faceCentroid;
	faceCentroid.DuplicateVector( face_centroids[facenum] );

	if ( smoothing_threshold != 1 )
	{
		faceneighbor_t *fn = &faceneighbor[facenum];

		// Calculate modified point normal for surface
		// Use the edge normals iff they are defined.  Bend the surface towards the edge normal(s)
		// Crude first attempt: find nearest edge normal and do a simple interpolation with facenormal.
		// Second attempt: find edge points+center that bound the point and do a three-point triangulation(baricentric)
		// Better third attempt: generate the point normals for all vertices and do baricentric triangulation.

		for ( j = 0; j < f->numedges; ++j )
		{
			Vector	v1, v2;
			fltx4		a1, a2;
			float		aa, bb, ab;
			int			vert1, vert2;

			Vector& n1 = fn->normal[j];
			Vector& n2 = fn->normal[(j+1)%f->numedges];

			vert1 = EdgeVertex( f, j );
			vert2 = EdgeVertex( f, j+1 );

			Vector& p1 = dvertexes[vert1].point;
			Vector& p2 = dvertexes[vert2].point;

			// Build vectors from the middle of the face to the edge vertexes and the sample pos.
			VectorSubtract( p1, face_centroids[facenum], v1 );
			VectorSubtract( p2, face_centroids[facenum], v2 );
			//VectorSubtract( spot, face_centroids[facenum], vspot );
			vspot = spot;
			vspot -= faceCentroid;
			aa = DotProduct( v1, v1 );
			bb = DotProduct( v2, v2 );
			ab = DotProduct( v1, v2 );
			//a1 = (bb * DotProduct( v1, vspot ) - ab * DotProduct( vspot, v2 )) / (aa * bb - ab * ab);
			a1 = ReciprocalSIMD( ReplicateX4( aa * bb - ab * ab ) );
			a1 = MulSIMD( a1, SubSIMD( MulSIMD( ReplicateX4( bb ), vspot * v1 ), MulSIMD( ReplicateX4( ab ), vspot * v2 ) ) );
			//a2 = (DotProduct( vspot, v2 ) - a1 * ab) / bb;
			a2 = ReciprocalSIMD( ReplicateX4( bb ) );
			a2 = MulSIMD( a2, SubSIMD( vspot * v2, MulSIMD( a1, ReplicateX4( ab ) ) ) );

			fltx4 resultMask = AndSIMD( CmpGeSIMD( a1, Four_Zeros ), CmpGeSIMD( a2, Four_Zeros ) );
			
			if ( !TestSignSIMD( resultMask ) )
				continue;

			// Store the old phong normal to avoid overwriting already computed phong normals
			FourVectors oldPhongNormal = phongnormal;

			// calculate distance from edge to pos
			FourVectors	temp;
			fltx4 scale;

			// Interpolate between the center and edge normals based on sample position
			scale = SubSIMD( SubSIMD( Four_Ones, a1 ), a2 );
			phongnormal.DuplicateVector( fn->facenormal );
			phongnormal *= scale;
			temp.DuplicateVector( n1 );
			temp *= a1;
			phongnormal += temp;
			temp.DuplicateVector( n2 );
			temp *= a2;
			phongnormal += temp;

			// restore the old phong normals
			phongnormal.x = AddSIMD( AndSIMD( resultMask, phongnormal.x ), AndNotSIMD( resultMask, oldPhongNormal.x ) );
			phongnormal.y = AddSIMD( AndSIMD( resultMask, phongnormal.y ), AndNotSIMD( resultMask, oldPhongNormal.y ) );
			phongnormal.z = AddSIMD( AndSIMD( resultMask, phongnormal.z ), AndNotSIMD( resultMask, oldPhongNormal.z ) );
		}

		phongnormal.VectorNormalize();
	}
}


int GetVisCache( int lastoffset, int cluster, byte *pvs )
{
	// get the PVS for the pos to limit the number of checks
    if ( !visdatasize )
    {       
        memset (pvs, 255, (dvis->numclusters+7)/8 );
        lastoffset = -1;
    }
    else 
    {
		if (cluster < 0)
		{
			// Error, point embedded in wall
			// sampled[0][1] = 255;
			memset (pvs, 255, (dvis->numclusters+7)/8 );
			lastoffset = -1;
		}
		else
		{
			int thisoffset = dvis->bitofs[ cluster ][DVIS_PVS];
			if ( thisoffset != lastoffset )
			{ 
				if ( thisoffset == -1 )
				{
					Error ("visofs == -1");
				}

				DecompressVis (&dvisdata[thisoffset], pvs);
			}
			lastoffset = thisoffset;
		}
    }
	return lastoffset;
}


void BuildPatchLights( int facenum );

void DumpSamples( int ndxFace, facelight_t *pFaceLight )
{
	ThreadLock();

	dface_t *pFace = &g_pFaces[ndxFace];
	if( pFace )
	{
		bool bBumpped = ( ( texinfo[pFace->texinfo].flags & SURF_BUMPLIGHT ) != 0 );

		for( int iStyle = 0; iStyle < 4; ++iStyle )
		{
			if( pFace->styles[iStyle] != 255 )
			{
				for ( int iBump = 0; iBump < 4; ++iBump )
				{
					if ( iBump == 0 || ( iBump > 0 && bBumpped ) )
					{
						for( int iSample = 0; iSample < pFaceLight->numsamples; ++iSample )
						{
							sample_t *pSample = &pFaceLight->sample[iSample];
							WriteWinding( pFileSamples[iStyle][iBump], pSample->w, pFaceLight->light[iStyle][iBump][iSample].m_vecLighting );
							if( bDumpNormals )
							{
								WriteNormal( pFileSamples[iStyle][iBump], pSample->pos, pSample->normal, 15.0f, pSample->normal * 255.0f );
							}
						}
					}
				}
			}
		}
	}

	ThreadUnlock();
}


//-----------------------------------------------------------------------------
// Allocates light sample data
//-----------------------------------------------------------------------------
static inline void AllocateLightstyleSamples( facelight_t* fl, int styleIndex, int numnormals )
{
	for (int n = 0; n < numnormals; ++n)
	{
		fl->light[styleIndex][n] = ( LightingValue_t* )calloc( fl->numsamples, sizeof(LightingValue_t ) );
	}
}


//-----------------------------------------------------------------------------
// Used to find an existing lightstyle on a face
//-----------------------------------------------------------------------------
static inline int FindLightstyle( dface_t* f, int lightstyle )
{
 	for (int k = 0; k < MAXLIGHTMAPS; k++)
	{
		if (f->styles[k] == lightstyle)
			return k;
	}

	return -1;
}

static int FindOrAllocateLightstyleSamples( dface_t* f, facelight_t	*fl, int lightstyle, int numnormals )
{
	// Search the lightstyles associated with the face for a match
	int k;
 	for (k = 0; k < MAXLIGHTMAPS; k++)
	{
		if (f->styles[k] == lightstyle)
			break;

		// Found an empty entry, we can use it for a new lightstyle
		if (f->styles[k] == 255)
		{
			AllocateLightstyleSamples( fl, k, numnormals );
			f->styles[k] = lightstyle;
			break;
		}
	}

	// Check for overflow
	if (k >= MAXLIGHTMAPS)
		return -1;

	return k;
}


//-----------------------------------------------------------------------------
// Compute the illumination point + normal for the sample
//-----------------------------------------------------------------------------
static void ComputeIlluminationPointAndNormalsSSE( lightinfo_t const& l, FourVectors const &pos, FourVectors const &norm, SSE_SampleInfo_t* pInfo, int numSamples )
{
	Vector v[4];

	pInfo->m_Points = pos;
	bool computeNormals = ( pInfo->m_NormalCount > 1 && ( pInfo->m_IsDispFace || !l.isflat ) );

	// FIXME: move sample point off the surface a bit, this is done so that
	// light sampling will not be affected by a bug	where raycasts will
	// intersect with the face being lit. We really should just have that
	// logic in GatherSampleLight
	FourVectors faceNormal;
	faceNormal.DuplicateVector( l.facenormal );
	pInfo->m_Points += faceNormal;

	if ( pInfo->m_IsDispFace )
	{
		pInfo->m_PointNormals[0] = norm;
	}
	else if ( !l.isflat )
	{
		// If the face isn't flat, use a phong-based normal instead
		FourVectors modelorg;
		modelorg.DuplicateVector( l.modelorg );
		FourVectors vecSample = pos;
		vecSample -= modelorg;
		GetPhongNormal( pInfo->m_FaceNum, vecSample, pInfo->m_PointNormals[0] );
	}

	if ( computeNormals )
	{
		Vector bv[4][NUM_BUMP_VECTS];
		for ( int i = 0; i < 4; ++i )
		{
			// TODO: using Vec may slow things down a bit
			GetBumpNormals( pInfo->m_pTexInfo->textureVecsTexelsPerWorldUnits[0],
			                pInfo->m_pTexInfo->textureVecsTexelsPerWorldUnits[1],
			                l.facenormal, pInfo->m_PointNormals[0].Vec( i ), bv[i] );
		}
		for ( int b = 0; b < NUM_BUMP_VECTS; ++b )
		{
			pInfo->m_PointNormals[b+1].LoadAndSwizzle ( bv[0][b], bv[1][b], bv[2][b], bv[3][b] );
		}
	}

	// TODO: this may slow things down a bit ( using Vec )
	for ( int i = 0; i < 4; ++i )
		pInfo->m_Clusters[i] = ClusterFromPoint( pos.Vec( i ) );
}

//-----------------------------------------------------------------------------
// Compute the illumination point + normal for the sample on a displacement
// (see ComputeIlluminationPointAndNormalsSSE above)
//-----------------------------------------------------------------------------
static void ComputeIlluminationPointAndNormalsForDisp( lightinfo_t const& l, FourVectors &pos, FourVectors &norm, SSE_SampleInfo_t* pInfo )
{
	pInfo->m_PointNormals[ 0 ] = norm;

	if ( pInfo->m_NormalCount > 1 )
	{
		Vector bv[ 4 ][ NUM_BUMP_VECTS ];
		for ( int j = 0; j < 4; j++ )
		{
			// TODO: using Vec may slow things down a bit
			GetBumpNormals( pInfo->m_pTexInfo->textureVecsTexelsPerWorldUnits[ 0 ],
							pInfo->m_pTexInfo->textureVecsTexelsPerWorldUnits[ 1 ],
							l.facenormal, norm.Vec( j ), bv[ j ] );
		}
		for ( int b = 0; b < NUM_BUMP_VECTS; b++ )
		{
			pInfo->m_PointNormals[ b + 1 ].LoadAndSwizzle( bv[ 0 ][ b ], bv[ 1 ][ b ], bv[ 2 ][ b ], bv[ 3 ][ b ] );
		}
	}

	pInfo->m_Points = pos;

	// FIXME: move sample point off the surface a bit, this is done so that
	// light sampling will not be affected by a bug	where raycasts will
	// intersect with the face being lit. We really should just have that
	// logic in GatherSampleLight
	FourVectors faceNormal = norm;
	pInfo->m_Points += faceNormal;

	// TODO: this may slow things down a bit ( using Vec )
	for ( int j = 0; j < 4; j++ )
		pInfo->m_Clusters[ j ] = ClusterFromPoint( pos.Vec( j ) );
}

//-----------------------------------------------------------------------------
// Iterates over all lights and computes lighting at up to 4 sample points
//-----------------------------------------------------------------------------
static void GatherSampleLightAt4Points( SSE_SampleInfo_t& info, int sampleIdx, int numSamples )
{
	SSE_sampleLightOutput_t out;

	// Iterate over all direct lights and add them to the particular sample
	for (directlight_t *dl = activelights; dl != NULL; dl = dl->next)
	{	    
		// is this lights cluster visible?
		fltx4 dotMask = Four_Zeros;
		bool skipLight = true;
		for( int s = 0; s < numSamples; s++ )
		{
			if( PVSCheck( dl->pvs, info.m_Clusters[s] ) )
			{
				dotMask = SetComponentSIMD( dotMask, s, 1.0f );
				skipLight = false;
			}
		}
		if ( skipLight )
			continue;

		GatherSampleLightSSE( out, dl, info.m_FaceNum, info.m_Points, info.m_PointNormals, info.m_NormalCount, info.m_iThread );
		
		// Apply the PVS check filter and compute falloff x dot
		fltx4 fxdot[NUM_BUMP_VECTS + 1];
		skipLight = true;
		for ( int b = 0; b < info.m_NormalCount; b++ )
		{
			fxdot[b] = MulSIMD( out.m_flDot[b], dotMask );
			fxdot[b] = MulSIMD( fxdot[b], out.m_flFalloff );
			if ( !IsAllZeros( fxdot[b] ) )
			{
				skipLight = false;
			}
		}
		if ( skipLight )
			continue;

		// Figure out the lightstyle for this particular sample
		int lightStyleIndex = FindOrAllocateLightstyleSamples( info.m_pFace, info.m_pFaceLight, 
			dl->light.style, info.m_NormalCount );
		if (lightStyleIndex < 0)
		{
			if (info.m_WarnFace != info.m_FaceNum)
			{
				Warning ("\nWARNING: Too many light styles on a face at (%f, %f, %f)\n",
					info.m_Points.x.m128_f32[0], info.m_Points.y.m128_f32[0], info.m_Points.z.m128_f32[0] );
				info.m_WarnFace = info.m_FaceNum;
			}
			continue;
		}

		// pLightmaps is an array of the lightmaps for each normal direction,
		// here's where the result of the sample gathering goes
		LightingValue_t** pLightmaps = info.m_pFaceLight->light[lightStyleIndex];

		// Incremental lighting only cares about lightstyle zero
		if( g_pIncremental && (dl->light.style == 0) )
		{
			for ( int i = 0; i < numSamples; i++ )
			{
				g_pIncremental->AddLightToFace( dl->m_IncrementalID, info.m_FaceNum, sampleIdx + i, 
					info.m_LightmapSize, SubFloat( fxdot[0], i ), info.m_iThread );
			}
		}

		for( int n = 0; n < info.m_NormalCount; ++n )
		{
			for ( int i = 0; i < numSamples; i++ )
			{
				pLightmaps[n][sampleIdx + i].AddLight( SubFloat( fxdot[n], i ), dl->light.intensity, SubFloat( out.m_flSunAmount[n], i ) );
			}
		}
	}
}


//-----------------------------------------------------------------------------
// Iterates over all lights and computes lighting at a sample point
//-----------------------------------------------------------------------------
static void ResampleLightAt4Points( SSE_SampleInfo_t& info, int lightStyleIndex, int flags, LightingValue_t pLightmap[4][NUM_BUMP_VECTS+1] )
{
	SSE_sampleLightOutput_t out;

	// Clear result
	for ( int i = 0; i < 4; ++i )
	{
		for ( int n = 0; n < info.m_NormalCount; ++n )
		{
			pLightmap[i][n].Zero();
		}
	}

	// Iterate over all direct lights and add them to the particular sample
	for (directlight_t *dl = activelights; dl != NULL; dl = dl->next)
	{
		if ((flags & AMBIENT_ONLY) && (dl->light.type != emit_skyambient))
			continue;

		if ((flags & NON_AMBIENT_ONLY) && (dl->light.type == emit_skyambient))
			continue;

		// Only add contributions that match the lightstyle 
		Assert( lightStyleIndex <= MAXLIGHTMAPS );
		Assert( info.m_pFace->styles[lightStyleIndex] != 255 );
		if (dl->light.style != info.m_pFace->styles[lightStyleIndex])
			continue;

		// is this lights cluster visible?
		fltx4 dotMask = Four_Zeros;
		bool skipLight = true;
		for( int s = 0; s < 4; s++ )
		{
			if( PVSCheck( dl->pvs, info.m_Clusters[s] ) )
			{
				dotMask = SetComponentSIMD( dotMask, s, 1.0f );
				skipLight = false;
			}
		}
		if ( skipLight )
			continue;

		// NOTE: Notice here that if the light is on the back side of the face
		// (tested by checking the dot product of the face normal and the light position)
		// we don't want it to contribute to *any* of the bumped lightmaps. It glows
		// in disturbing ways if we don't do this.
		GatherSampleLightSSE( out, dl, info.m_FaceNum, info.m_Points, info.m_PointNormals, info.m_NormalCount, info.m_iThread );

		// Apply the PVS check filter and compute falloff x dot
		fltx4 fxdot[NUM_BUMP_VECTS + 1];
		for ( int b = 0; b < info.m_NormalCount; b++ )
		{
			fxdot[b] = MulSIMD( out.m_flFalloff, out.m_flDot[b] );
			fxdot[b] = MulSIMD( fxdot[b], dotMask );
		}

		// Compute the contributions to each of the bumped lightmaps
		// The first sample is for non-bumped lighting.
		// The other sample are for bumpmapping.
		for( int i = 0; i < 4; ++i )
		{
			for( int n = 0; n < info.m_NormalCount; ++n )
			{
				pLightmap[i][n].AddLight( SubFloat( fxdot[n], i ), dl->light.intensity, SubFloat( out.m_flSunAmount[n], i ) );
			}
		}
	}
}

bool PointsInWinding( FourVectors const & point, winding_t *w, int &invalidBits )
{
	FourVectors edge, toPt, cross, testCross, p0, p1;
	fltx4 invalidMask = Four_Zeros;

	//
	// get the first normal to test
	//
	p0.DuplicateVector( w->p[0] );
	p1.DuplicateVector( w->p[1] );
	toPt = point;
	toPt -= p0;
	edge = p1;
	edge -= p0;
	testCross = edge ^ toPt;
	// safer against /0 - testCross.VectorNormalizeFast();
	fltx4 mag_sq = testCross * testCross;
	testCross *= ReciprocalSqrtEstSaturateSIMD( mag_sq );

	for ( int ndxPt = 1; ndxPt < w->numpoints; ndxPt++ )
	{
		p0.DuplicateVector( w->p[ndxPt] );
		p1.DuplicateVector( w->p[(ndxPt+1)%w->numpoints] );
		toPt = point;
		toPt -= p0;
		edge = p1;
		edge -= p0;
		cross = edge ^ toPt;
		// safer against /0 - cross.VectorNormalizeFast();
		mag_sq = cross * cross;
		cross *= ReciprocalSqrtEstSaturateSIMD( mag_sq );

		fltx4 dot = cross * testCross;
		invalidMask = OrSIMD( invalidMask, CmpLtSIMD( dot, Four_Zeros ) );

		invalidBits = TestSignSIMD ( invalidMask );
		if ( invalidBits == 0xF )
			return false;
	}

	return true;
}

//-----------------------------------------------------------------------------
// Perform supersampling at a particular point
//-----------------------------------------------------------------------------
static int SupersampleLightAtPoint( lightinfo_t& l, SSE_SampleInfo_t& info, 
									int sampleIndex, int lightStyleIndex, LightingValue_t *pLight, int flags )
{
	sample_t& sample = info.m_pFaceLight->sample[sampleIndex];

	// Get the position of the original sample in lightmapspace
	Vector2D temp;
	WorldToLuxelSpace( &l, sample.pos, temp );
	Vector sampleLightOrigin( temp[0], temp[1], 0.0f );

	// Some parameters related to supersampling
	float sampleWidth = ( flags & NON_AMBIENT_ONLY ) ? 4 : 2;

	float cscale = 1.0f / sampleWidth;
	float csshift = -( ( sampleWidth - 1 ) * cscale ) / 2.0;

	// Clear out the light values
	for (int i = 0; i < info.m_NormalCount; ++i )
		pLight[i].Zero();

	int subsampleCount = 0;

	FourVectors superSampleNormal;
	superSampleNormal.DuplicateVector( sample.normal );

	FourVectors superSampleLightCoord;
	FourVectors superSamplePosition;
	superSamplePosition.DuplicateVector( sample.pos );

	Vector wsError;
	FourVectors superSampleWorldSpaceError;

	float stepU = 0.0f;
	float stepV = 0.0f;
	if ( info.m_IsDispFace )
	{
		// compensate for error when transforming back to worldspace (only enabled for displacements)
		Vector toWorld;
		LuxelSpaceToWorld( &l, temp.x, temp.y, toWorld );
		VectorSubtract( sample.pos, toWorld, wsError );
		superSampleWorldSpaceError.DuplicateVector( wsError );

		// lightmap size
		int width = l.face->m_LightmapTextureSizeInLuxels[ 0 ] + 1;
		int height = l.face->m_LightmapTextureSizeInLuxels[ 1 ] + 1;

		// calculate the steps in uv space
		stepU = 1.0f / (float)width;
		stepV = 1.0f / (float)height;
	}

	if ( flags & NON_AMBIENT_ONLY )
	{
		float aRow[4];
		for ( int coord = 0; coord < 4; ++coord )
			aRow[ coord ] = csshift + coord * cscale;
		fltx4 sseRow = LoadUnalignedSIMD( aRow );

		for (int s = 0; s < 4; ++s)
		{
			// make sure the coordinate is inside of the sample's winding and when normalizing
			// below use the number of samples used, not just numsamples and some of them
			// will be skipped if they are not inside of the winding
			superSampleLightCoord.DuplicateVector( sampleLightOrigin );
			superSampleLightCoord.x = AddSIMD( superSampleLightCoord.x, ReplicateX4( aRow[s] ) );
			superSampleLightCoord.y = AddSIMD( superSampleLightCoord.y, sseRow );

			// Figure out where the supersample exists in the world, and make sure
			// it lies within the sample winding
			LuxelSpaceToWorld( &l, superSampleLightCoord[0], superSampleLightCoord[1], superSamplePosition );
 
			if ( info.m_IsDispFace )
			{
				// Fix up error from world to luxel and back again
				superSamplePosition += superSampleWorldSpaceError;

				// Find pos and norm for disp from uv supersample offsets
 				Vector vDispP[4], vDispN[4];
  				for ( int i = 0; i < 4; i++ )
 				{
 					vDispP[ i ] = superSamplePosition.Vec( i );
					Vector2D uv;

					uv.x = sample.coord[0] + ( aRow[ s ] * stepU );
					uv.y = sample.coord[1] + ( aRow[ i ] * stepV );

					StaticDispMgr()->GetDispSurfPointAndNormalFromUV( info.m_FaceNum, vDispP[ i ], vDispN[ i ], uv, false );
				}
				superSamplePosition = FourVectors( vDispP[ 0 ], vDispP[ 1 ], vDispP[ 2 ], vDispP[ 3 ] );
				superSampleNormal = FourVectors( vDispN[ 0 ], vDispN[ 1 ], vDispN[ 2 ], vDispN[ 3 ] );

				ComputeIlluminationPointAndNormalsForDisp( l, superSamplePosition, superSampleNormal, &info );
			}

			// A winding should exist only if the sample wasn't a uniform luxel, or if g_bDumpPatches is true.
			int invalidBits = 0;
			if ( sample.w && !PointsInWinding( superSamplePosition, sample.w, invalidBits ) )
				continue;

			// Compute the super-sample illumination point and normal
			// We're assuming the flat normal is the same for all supersamples
			if ( !info.m_IsDispFace )
				ComputeIlluminationPointAndNormalsSSE( l, superSamplePosition, superSampleNormal, &info, 4 );

			// Resample the non-ambient light at this point...
			LightingValue_t result[4][NUM_BUMP_VECTS+1];
			ResampleLightAt4Points( info, lightStyleIndex, NON_AMBIENT_ONLY, result );

			// Got more subsamples
			for ( int i = 0; i < 4; i++ )
			{
				if ( !( ( invalidBits >> i ) & 0x1 ) )
				{
					for ( int n = 0; n < info.m_NormalCount; ++n )
					{
						pLight[ n ].AddLight( result[ i ][ n ] );
					}
					++subsampleCount;
				}
			}
		}
	}
	else
	{
		FourVectors superSampleOffsets;
		superSampleOffsets.LoadAndSwizzle( Vector( csshift, csshift, 0 ), Vector( csshift, csshift + cscale, 0),
		                                   Vector( csshift + cscale, csshift, 0 ), Vector( csshift + cscale, csshift + cscale, 0 ) );
		superSampleLightCoord.DuplicateVector( sampleLightOrigin );
		superSampleLightCoord += superSampleOffsets;

		LuxelSpaceToWorld( &l, superSampleLightCoord[0], superSampleLightCoord[1], superSamplePosition );

		if ( info.m_IsDispFace )
		{
			// Fix up error from world to luxel and back again
			superSamplePosition += superSampleWorldSpaceError;

			// Find pos and norm for disp from uv supersample offsets
			Vector vDispP[ 4 ], vDispN[ 4 ];
			for ( int i = 0; i < 4; i++ )
			{
				vDispP[ i ] = superSamplePosition.Vec( i );
				Vector uvOffsets = superSampleOffsets.Vec( i );
				Vector2D uv;

				uv.x = sample.coord[ 0 ] + ( uvOffsets.x * stepU );
				uv.y = sample.coord[ 1 ] + ( uvOffsets.y * stepV );

				StaticDispMgr()->GetDispSurfPointAndNormalFromUV( info.m_FaceNum, vDispP[ i ], vDispN[ i ], uv, false );
			}
			superSamplePosition = FourVectors( vDispP[ 0 ], vDispP[ 1 ], vDispP[ 2 ], vDispP[ 3 ] );
			superSampleNormal = FourVectors( vDispN[ 0 ], vDispN[ 1 ], vDispN[ 2 ], vDispN[ 3 ] );

			ComputeIlluminationPointAndNormalsForDisp( l, superSamplePosition, superSampleNormal, &info );
		}

		int invalidBits = 0;
		if ( sample.w && !PointsInWinding( superSamplePosition, sample.w, invalidBits ) )
			return 0;

		if ( !info.m_IsDispFace )
			ComputeIlluminationPointAndNormalsSSE( l, superSamplePosition, superSampleNormal, &info, 4 );

		LightingValue_t result[4][NUM_BUMP_VECTS+1];
		ResampleLightAt4Points( info, lightStyleIndex, AMBIENT_ONLY, result );

		// Got more subsamples
		for ( int i = 0; i < 4; i++ )
		{
			if ( !( ( invalidBits >> i ) & 0x1 ) )
			{
				for ( int n = 0; n < info.m_NormalCount; ++n )
				{
					pLight[ n ].AddLight( result[ i ][ n ] );
				}
				++subsampleCount;
			}
		}
	}

	return subsampleCount;
}


//-----------------------------------------------------------------------------
// Compute gradients of a lightmap
//-----------------------------------------------------------------------------
static void ComputeLightmapGradients( SSE_SampleInfo_t& info, bool const* pHasProcessedSample, 
									  float* pIntensity, float* gradient )
{
	int w = info.m_LightmapWidth;
	int h = info.m_LightmapHeight;
	facelight_t* fl = info.m_pFaceLight;

	for (int i=0 ; i<fl->numsamples ; i++)
	{
		// Don't supersample the same sample twice
		if (pHasProcessedSample[i])
			continue;

		gradient[i] = 0.0f;
		sample_t& sample = fl->sample[i];

		// Choose the maximum gradient of all bumped lightmap intensities
		for ( int n = 0; n < info.m_NormalCount; ++n )
		{
			int j = n * info.m_LightmapSize + sample.s + sample.t * w;

			if (sample.t > 0)
			{
				if (sample.s > 0)   gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j-1-w] ) );
				gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j-w] ) );
				if (sample.s < w-1) gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j+1-w] ) );
			}
			if (sample.t < h-1)
			{
				if (sample.s > 0)   gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j-1+w] ) );
				gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j+w] ) );
				if (sample.s < w-1) gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j+1+w] ) );
			}
			if (sample.s > 0)   gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j-1] ) );
			if (sample.s < w-1) gradient[i] = max( gradient[i], fabs( pIntensity[j] - pIntensity[j+1] ) );
		}
	}
}

//-----------------------------------------------------------------------------
// ComputeLuxelIntensity...
//-----------------------------------------------------------------------------
static inline void ComputeLuxelIntensity( SSE_SampleInfo_t& info, int sampleIdx, 
										  LightingValue_t **ppLightSamples, float* pSampleIntensity )
{
	// Compute a separate intensity for each
	sample_t& sample = info.m_pFaceLight->sample[sampleIdx];
	int destIdx = sample.s + sample.t * info.m_LightmapWidth;
	for (int n = 0; n < info.m_NormalCount; ++n)
	{
		float intensity = ppLightSamples[n][sampleIdx].Intensity();

		// convert to a linear perception space
		pSampleIntensity[n * info.m_LightmapSize + destIdx] = pow( intensity / 256.0, 1.0 / 2.2 );
	}
}

//-----------------------------------------------------------------------------
// Compute the maximum intensity based on all bumped lighting
//-----------------------------------------------------------------------------
static void ComputeSampleIntensities( SSE_SampleInfo_t& info, LightingValue_t **ppLightSamples, float* pSampleIntensity )
{
	for (int i=0; i<info.m_pFaceLight->numsamples; i++)
	{
		ComputeLuxelIntensity( info, i, ppLightSamples, pSampleIntensity );
	}
}

//-----------------------------------------------------------------------------
// Perform supersampling on a particular lightstyle
//-----------------------------------------------------------------------------
static void BuildSupersampleFaceLights( lightinfo_t& l, SSE_SampleInfo_t& info, int lightstyleIndex )
{
	LightingValue_t pAmbientLight[NUM_BUMP_VECTS+1];
	LightingValue_t pDirectLight[NUM_BUMP_VECTS+1];

	// This is used to make sure we don't supersample a light sample more than once
	int processedSampleSize = info.m_LightmapSize * sizeof(bool);
	bool* pHasProcessedSample = (bool*)stackalloc( processedSampleSize );
	memset( pHasProcessedSample, 0, processedSampleSize );

	// This is used to compute a simple gradient computation of the light samples
	// We're going to store the maximum intensity of all bumped samples at each sample location
	float* pGradient = (float*)stackalloc( info.m_pFaceLight->numsamples * sizeof(float) );
	float* pSampleIntensity = (float*)stackalloc( info.m_NormalCount * info.m_LightmapSize * sizeof(float) );

	// Compute the maximum intensity of all lighting associated with this lightstyle
	// for all bumped lighting
	LightingValue_t **ppLightSamples = info.m_pFaceLight->light[lightstyleIndex];
	ComputeSampleIntensities( info, ppLightSamples, pSampleIntensity );

	Vector *pVisualizePass = NULL;
	if (debug_extra)
	{
		int visualizationSize = info.m_pFaceLight->numsamples * sizeof(Vector);
		pVisualizePass = (Vector*)stackalloc( visualizationSize );
		memset( pVisualizePass, 0, visualizationSize ); 
	}

	// What's going on here is that we're looking for large lighting discontinuities
	// (large light intensity gradients) as a clue that we should probably be supersampling
	// in that area. Because the supersampling operation will cause lighting changes,
	// we've found that it's good to re-check the gradients again and see if any other
	// areas should be supersampled as a result of the previous pass. Keep going
	// until all the gradients are reasonable or until we hit a max number of passes
	bool do_anotherpass = true;
	int pass = 1;
	while (do_anotherpass && pass <= extrapasses)
	{
		// Look for lighting discontinuities to see what we should be supersampling
		ComputeLightmapGradients( info, pHasProcessedSample, pSampleIntensity, pGradient );

		do_anotherpass = false;

		// Now check all of the samples and supersample those which we have
		// marked as having high gradients
		for (int i=0 ; i<info.m_pFaceLight->numsamples; ++i)
		{
			// Don't supersample the same sample twice
			if (pHasProcessedSample[i])
				continue;

			// Don't supersample if the lighting is pretty uniform near the sample
			if (pGradient[i] < 0.0625)
				continue;

			// Joy! We're supersampling now, and we therefore must do another pass
			// Also, we need never bother with this sample again
			pHasProcessedSample[i] = true;
			do_anotherpass = true;

			if (debug_extra)
			{
				// Mark the little visualization bitmap with a color indicating
				// which pass it was updated on.
				pVisualizePass[i][0] = (pass & 1) * 255;
				pVisualizePass[i][1] = (pass & 2) * 128;
				pVisualizePass[i][2] = (pass & 4) * 64;
			}

			// Supersample the ambient light for each bump direction vector
			int ambientSupersampleCount = SupersampleLightAtPoint( l, info, i, lightstyleIndex, pAmbientLight, AMBIENT_ONLY );

			// Supersample the non-ambient light for each bump direction vector
			int directSupersampleCount = SupersampleLightAtPoint( l, info, i, lightstyleIndex, pDirectLight, NON_AMBIENT_ONLY );

			// Because of sampling problems, small area triangles may have no samples.
			// In this case, just use what we already have
			if ( ambientSupersampleCount > 0 && directSupersampleCount > 0 )
			{
				// Add the ambient + directional terms together, stick it back into the lightmap
				for ( int n = 0; n < info.m_NormalCount; ++n )
				{
					ppLightSamples[ n ][ i ].Zero();
					ppLightSamples[ n ][ i ].AddWeighted( pDirectLight[ n ], 1.0f / directSupersampleCount );
					ppLightSamples[ n ][ i ].AddWeighted( pAmbientLight[ n ], 1.0f / ambientSupersampleCount );
				}

				// Recompute the luxel intensity based on the supersampling
				ComputeLuxelIntensity( info, i, ppLightSamples, pSampleIntensity );
			}
		}

		// We've finished another pass
		pass++;
	}

	if (debug_extra)
	{
		// Copy colors representing which supersample pass the sample was messed with
		// into the actual lighting values so we can visualize it
		for (int i=0 ; i<info.m_pFaceLight->numsamples ; ++i)
		{
			for (int j = 0; j <info.m_NormalCount; ++j)
			{
				VectorCopy( pVisualizePass[i], ppLightSamples[j][i].m_vecLighting ); 
			}
		}
	}
}

void InitLightinfo( lightinfo_t *pl, int facenum )
{
	dface_t		*f;

	f = &g_pFaces[facenum];

	memset (pl, 0, sizeof(*pl));
	pl->facenum = facenum;

    pl->face = f;

    //
    // rotate plane 
    //
	VectorCopy (dplanes[f->planenum].normal, pl->facenormal);
	pl->facedist = dplanes[f->planenum].dist;

	// get the origin offset for rotating bmodels
	VectorCopy (face_offset[facenum], pl->modelorg);

	CalcFaceVectors( pl );

	// figure out if the surface is flat
	pl->isflat = true;
	if (smoothing_threshold != 1)
	{
		faceneighbor_t *fn = &faceneighbor[facenum];

		for (int j=0 ; j<f->numedges ; j++)
		{
			float dot = DotProduct( pl->facenormal, fn->normal[j] );
			if (dot < 1.0 - EQUAL_EPSILON)
			{
				pl->isflat = false;
				break;
			}
		}
	}
}

static void InitSampleInfo( lightinfo_t const& l, int iThread, SSE_SampleInfo_t& info )
{
	info.m_LightmapWidth  = l.face->m_LightmapTextureSizeInLuxels[0]+1;
	info.m_LightmapHeight = l.face->m_LightmapTextureSizeInLuxels[1]+1;
	info.m_LightmapSize = info.m_LightmapWidth * info.m_LightmapHeight;

	// How many lightmaps are we going to need?
	info.m_pTexInfo = &texinfo[l.face->texinfo];
	info.m_NormalCount = (info.m_pTexInfo->flags & SURF_BUMPLIGHT) ? NUM_BUMP_VECTS + 1 : 1;
	info.m_FaceNum = l.facenum;
	info.m_pFace = l.face;
	info.m_pFaceLight = &facelight[info.m_FaceNum];
	info.m_IsDispFace = ValidDispFace( info.m_pFace );
	info.m_iThread = iThread;
	info.m_WarnFace = -1;

	info.m_NumSamples = info.m_pFaceLight->numsamples;
	info.m_NumSampleGroups = ( info.m_NumSamples & 0x3) ? ( info.m_NumSamples / 4 ) + 1 : ( info.m_NumSamples / 4 );

	// initialize normals if the surface is flat
	if (l.isflat)
	{
		info.m_PointNormals[0].DuplicateVector( l.facenormal );

		// use facenormal along with the smooth normal to build the three bump map vectors
		if( info.m_NormalCount > 1 )
		{
			Vector bumpVects[NUM_BUMP_VECTS];
			GetBumpNormals( info.m_pTexInfo->textureVecsTexelsPerWorldUnits[0], 
				info.m_pTexInfo->textureVecsTexelsPerWorldUnits[1], l.facenormal, 
				l.facenormal, bumpVects );//&info.m_PointNormal[1] );

			for ( int b = 0; b < NUM_BUMP_VECTS; ++b )
			{
				info.m_PointNormals[b + 1].DuplicateVector( bumpVects[b] );
			}
		}
	}
}

void BuildFacelights (int iThread, int facenum)
{
	int	i, j;

	lightinfo_t	l;
	dface_t *f;
	facelight_t	*fl;
	SSE_SampleInfo_t sampleInfo;
	directlight_t *dl;
	Vector spot;
	Vector v[4], n[4];

	if( g_bInterrupt )
		return;

	// FIXME: Is there a better way to do this? Like, in RunThreadsOn, for instance?
	// Don't pay this cost unless we have to; this is super perf-critical code.
	if (g_pIncremental)
	{
		// Both threads will be accessing this so it needs to be protected or else thread A
		// will load it in and thread B will increment it but its increment will be
		// overwritten by thread A when thread A writes it back.
		ThreadLock();
		++g_iCurFace;
		ThreadUnlock();
	}

	// some surfaces don't need lightmaps
	f = &g_pFaces[facenum];
	f->lightofs = -1;
	for (j=0 ; j<MAXLIGHTMAPS ; j++)
		f->styles[j] = 255;

	// Trivial-reject the whole face?	
	if( !( g_FacesVisibleToLights[facenum>>3] & (1 << (facenum & 7)) ) )
		return;

	if ( texinfo[f->texinfo].flags & TEX_SPECIAL)
		return;		// non-lit texture

	// check for patches for this face.  If none it must be degenerate.  Ignore.
	if( g_FacePatches.Element( facenum ) == g_FacePatches.InvalidIndex() )
		return;

	fl = &facelight[facenum];

	InitLightinfo( &l, facenum );
	CalcPoints( &l, fl, facenum );
	InitSampleInfo( l, iThread, sampleInfo );

	// Allocate sample positions/normals to SSE
	int numGroups = ( fl->numsamples & 0x3) ? ( fl->numsamples / 4 ) + 1 : ( fl->numsamples / 4 );

	// always allocate style 0 lightmap
	f->styles[0] = 0;
	AllocateLightstyleSamples( fl, 0, sampleInfo.m_NormalCount );

	// sample the lights at each sample location
	for ( int grp = 0; grp < numGroups; ++grp )
	{
		int nSample = 4 * grp;

		sample_t *sample = sampleInfo.m_pFaceLight->sample + nSample;
		int numSamples = min ( 4, sampleInfo.m_pFaceLight->numsamples - nSample );

		FourVectors positions;
		FourVectors normals;

		for ( i = 0; i < 4; i++ )
		{
			v[i] = ( i < numSamples ) ? sample[i].pos : sample[numSamples - 1].pos;
			n[i] = ( i < numSamples ) ? sample[i].normal : sample[numSamples - 1].normal;
		}
		positions.LoadAndSwizzle( v[0], v[1], v[2], v[3] );
		normals.LoadAndSwizzle( n[0], n[1], n[2], n[3] );

		ComputeIlluminationPointAndNormalsSSE( l, positions, normals, &sampleInfo, numSamples );

		// Fixup sample normals in case of smooth faces
		if ( !l.isflat )
		{
			for ( i = 0; i < numSamples; i++ )
				sample[i].normal = sampleInfo.m_PointNormals[0].Vec( i );
		}

		// Iterate over all the lights and add their contribution to this group of spots
		GatherSampleLightAt4Points( sampleInfo, nSample, numSamples );
	}
	
	// Tell the incremental light manager that we're done with this face.
	if( g_pIncremental )
	{
		for (dl = activelights; dl != NULL; dl = dl->next)
		{
			// Only deal with lightstyle 0 for incremental lighting
			if (dl->light.style == 0)
				g_pIncremental->FinishFace( dl->m_IncrementalID, facenum, iThread );
		}

		// Don't have to deal with patch lights (only direct lighting is used)
		// or supersampling
		return;
	}

	// Enabling supersampling for displacements (previous revision always disabled do_extra for disp)
	// improves continuity significantly between disp and brush surfaces, especially when using high frequency alpha shadow materials
	if ( do_extra ) 
	{
		// For each lightstyle, perform a supersampling pass
		for ( i = 0; i < MAXLIGHTMAPS; ++i )
		{
			// Stop when we run out of lightstyles
			if (f->styles[i] == 255)
				break;

			BuildSupersampleFaceLights( l, sampleInfo, i );
		}
	}

	if (!g_bUseMPI) 
	{
		//
		// This is done on the master node when MPI is used
		//
		BuildPatchLights( facenum );
	}

	if( g_bDumpPatches )
	{
		DumpSamples( facenum, fl );
	}
	else
	{
		FreeSampleWindings( fl );
	}

}

void BuildPatchLights( int facenum )
{
	int i, k;

	CPatch		*patch;

	dface_t	*f = &g_pFaces[facenum];
	facelight_t	*fl = &facelight[facenum];

	for( k = 0; k < MAXLIGHTMAPS; k++ )
	{
		if (f->styles[k] == 0)
			break;
	}

	if (k >= MAXLIGHTMAPS)
		return;

	for (i = 0; i < fl->numsamples; i++)
	{
		AddSampleToPatch( &fl->sample[i], fl->light[k][0][i], facenum);
	}

	// check for a valid face
	if( g_FacePatches.Element( facenum ) == g_FacePatches.InvalidIndex() )
		return;

	// push up sampled light to parents (children always exist first in the list)
	CPatch *pNextPatch;
	for( patch = &g_Patches.Element( g_FacePatches.Element( facenum ) ); patch; patch = pNextPatch )
	{
		// next patch
		pNextPatch = NULL;
		if( patch->ndxNext != g_Patches.InvalidIndex() )
		{
			pNextPatch = &g_Patches.Element( patch->ndxNext );
		}

		// skip patches without parents
		if( patch->parent == g_Patches.InvalidIndex() )
//		if (patch->parent == -1)
			continue;

		CPatch *parent = &g_Patches.Element( patch->parent );

		parent->samplearea += patch->samplearea;
		VectorAdd( parent->samplelight, patch->samplelight, parent->samplelight );
	}

	// average up the direct light on each patch for radiosity
	if (numbounce > 0)
	{
		for( patch = &g_Patches.Element( g_FacePatches.Element( facenum ) ); patch; patch = pNextPatch )
		{
			// next patch
			pNextPatch = NULL;
			if( patch->ndxNext != g_Patches.InvalidIndex() )
			{
				pNextPatch = &g_Patches.Element( patch->ndxNext );
			}

			if (patch->samplearea)
			{ 
				float scale;
				Vector v;
				scale = 1.0 / patch->samplearea;

				VectorScale( patch->samplelight, scale, v );
				VectorAdd( patch->totallight.light[0], v, patch->totallight.light[0] );
				VectorAdd( patch->directlight, v, patch->directlight );
			}
		}
	}

	// pull totallight from children (children always exist first in the list)
	for( patch = &g_Patches.Element( g_FacePatches.Element( facenum ) ); patch; patch = pNextPatch )
	{
		// next patch
		pNextPatch = NULL;
		if( patch->ndxNext != g_Patches.InvalidIndex() )
		{
			pNextPatch = &g_Patches.Element( patch->ndxNext );
		}

		if ( patch->child1 != g_Patches.InvalidIndex() )
		{
			float s1, s2;
			CPatch *child1;
			CPatch *child2;

			child1 = &g_Patches.Element( patch->child1 );
			child2 = &g_Patches.Element( patch->child2 );

			s1 = child1->area / (child1->area + child2->area);
			s2 = child2->area / (child1->area + child2->area);

			VectorScale( child1->totallight.light[0], s1, patch->totallight.light[0] );
			VectorMA( patch->totallight.light[0], s2, child2->totallight.light[0], patch->totallight.light[0] );

			VectorCopy( patch->totallight.light[0], patch->directlight );
		}
	}

	bool needsBumpmap = false;
	if( texinfo[f->texinfo].flags & SURF_BUMPLIGHT )
	{
		needsBumpmap = true;
	}

	// add an ambient term if desired
	if (ambient[0] || ambient[1] || ambient[2])
	{
		for( int j=0; j < MAXLIGHTMAPS && f->styles[j] != 255; j++ )
		{
			if ( f->styles[j] == 0 )
			{
				for (i = 0; i < fl->numsamples; i++)
				{
					fl->light[j][0][i].m_vecLighting += ambient;
					if( needsBumpmap )
					{
						fl->light[j][1][i].m_vecLighting += ambient;
						fl->light[j][2][i].m_vecLighting += ambient;
						fl->light[j][3][i].m_vecLighting += ambient;
					}
				}
				break;
			}
		}
	}

	// light from dlight_threshold and above is sent out, but the
	// texture itself should still be full bright

#if 0
	// if( VectorAvg( g_FacePatches[facenum]->baselight ) >= dlight_threshold)	// Now all lighted surfaces glow
 {
	 for( j=0; j < MAXLIGHTMAPS && f->styles[j] != 255; j++ )
	 {
		 if ( f->styles[j] == 0 )
		 {
			 // BUG: shouldn't this be done for all patches on the face?
			 for (i=0 ; i<fl->numsamples ; i++)
			 {
				 // garymctchange
				 VectorAdd( fl->light[j][0][i], g_FacePatches[facenum]->baselight, fl->light[j][0][i] ); 
				 if( needsBumpmap )
				 {
					 for( bumpSample = 1; bumpSample < NUM_BUMP_VECTS + 1; bumpSample++ )
					 {
						 VectorAdd( fl->light[j][bumpSample][i], g_FacePatches[facenum]->baselight, fl->light[j][bumpSample][i] ); 
					 }
				 }
			 }
			 break;
		 }
	 }
 }
#endif
}


void BuildStaticPropPatchlights( int iThread, int nPatch )
{
	if ( g_Patches[ nPatch ].faceNumber >= 0 )
	{
		// Not a static prop patch
		return;
	}
	CPatch &patch = g_Patches[ nPatch ];

	// Random sample locations
	Vector vecOrigin = patch.winding->p[ 0 ];
	Vector vecU = patch.winding->p[ 2 ] - patch.winding->p[ 0 ];
	Vector vecV = patch.winding->p[ 1 ] - patch.winding->p[ 0 ];
	int nSampleCount = Max( 1, int( patch.area / 16.0f ) );
	float flSampleArea = patch.area / float( nSampleCount );
	float flSampleFrac = 1.0f / float( nSampleCount );
	sample_t *pSamples = new sample_t[ nSampleCount ];
	memset( pSamples, 0, sizeof( sample_t )*nSampleCount );
	for ( int i = 0; i < nSampleCount; i++ )
	{
		// Shitty. Should be jittered instead or some other better distribution over the triangle.
		float flU = RandomFloat();
		float flV = RandomFloat();
		if ( flU + flV > 1.0f )
		{
			flU = 1.0f - flU;
			flV = 1.0f - flV;
			Swap( flU, flV );
		}
		pSamples[ i ].pos = vecOrigin + flU * vecU + flV * vecV;
		pSamples[ i ].normal = patch.normal;
		pSamples[ i ].area = flSampleArea;
	}

	Vector directColor( 0.0f, 0.0f, 0.0f );
	float flSunAmount = 0.0f;

	// sample the lights at each sample location
	for ( int i = 0; i < nSampleCount; i++ )
	{
		sample_t *sample = pSamples + i;

		directColor.Init( 0.0f, 0.0f, 0.0f );
		flSunAmount = 0.0f;
		ComputeDirectLightingAtPoint( sample->pos, &sample->normal, &directColor, &flSunAmount, 1, false, iThread, -1, 0 );
		directColor *= g_flStaticPropBounceBoost;
		patch.totallight.light[ 0 ] += directColor * flSampleFrac;
		patch.directlight += directColor * flSampleFrac;
	}

	delete[] pSamples;
}

/*
  =============
  PrecompLightmapOffsets
  =============
*/

void PrecompLightmapOffsets()
{
    int facenum;
    dface_t *f;
    int lightstyles;
    int lightdatasize = 0;
    
    // NOTE: We store avg face light data in this lump *before* the lightmap data itself
	// in *reverse order* of the way the lightstyles appear in the styles array.
    for( facenum = 0; facenum < numfaces; facenum++ )
    {
        f = &g_pFaces[facenum];

        if ( texinfo[f->texinfo].flags & TEX_SPECIAL)
            continue;		// non-lit texture
        
        if ( dlight_map != 0 )
            f->styles[1] = 0;
        
        for (lightstyles=0; lightstyles < MAXLIGHTMAPS; lightstyles++ )
        {
            if ( f->styles[lightstyles] == 255 )
                break;
        }
        
        if ( !lightstyles )
            continue;

        // Reserve room for the avg light color data
		lightdatasize += lightstyles * 4;

        f->lightofs = lightdatasize;

		bool needsBumpmap = false;
		if( texinfo[f->texinfo].flags & SURF_BUMPLIGHT )
		{
			needsBumpmap = true;
		}

		int nLuxels = (f->m_LightmapTextureSizeInLuxels[0]+1) * (f->m_LightmapTextureSizeInLuxels[1]+1);
		if( needsBumpmap )
		{
			lightdatasize += nLuxels * 4 * lightstyles * ( NUM_BUMP_VECTS + 1 );
		}
		else
		{
	        lightdatasize += nLuxels * 4 * lightstyles;
		}

		// Add room for additional light data here that will be packed into lightmap alpha
		lightdatasize += nLuxels * 4 * lightstyles;
	}

	// The incremental lighting code needs us to preserve the contents of dlightdata
	// since it only recomposites lighting for faces that have lights that touch them.
	if( g_pIncremental && pdlightdata->Count() )
		return;

	pdlightdata->SetSize( lightdatasize );
}

// Clamp the three values for bumped lighting such that we trade off directionality for brightness.
static void ColorClampBumped( Vector& color1, Vector& color2, Vector& color3 )
{
	Vector maxs;
	Vector *colors[3] = { &color1, &color2, &color3 };
	maxs[0] = VectorMaximum( color1 );
	maxs[1] = VectorMaximum( color2 );
	maxs[2] = VectorMaximum( color3 );

	// HACK!  Clean this up, and add some else statements
#define CONDITION(a,b,c) do { if( maxs[a] >= maxs[b] && maxs[b] >= maxs[c] ) { order[0] = a; order[1] = b; order[2] = c; } } while( 0 )
	
	int order[3];
	CONDITION(0,1,2);
	CONDITION(0,2,1);
	CONDITION(1,0,2);
	CONDITION(1,2,0);
	CONDITION(2,0,1);
	CONDITION(2,1,0);

	int i;
	for( i = 0; i < 3; i++ )
	{
		float max = VectorMaximum( *colors[order[i]] );
		if( max <= 1.0f )
		{
			continue;
		}
		// This channel is too bright. . take half of the amount that we are over and 
		// add it to the other two channel.
		float factorToRedist = ( max - 1.0f ) / max;
		Vector colorToRedist = factorToRedist * *colors[order[i]];
		*colors[order[i]] -= colorToRedist;
		colorToRedist *= 0.5f;
		*colors[order[(i+1)%3]] += colorToRedist;
		*colors[order[(i+2)%3]] += colorToRedist;
	}

	ColorClamp( color1 );
	ColorClamp( color2 );
	ColorClamp( color3 );
	
	if( color1[0] < 0.f ) color1[0] = 0.f;
	if( color1[1] < 0.f ) color1[1] = 0.f;
	if( color1[2] < 0.f ) color1[2] = 0.f;
	if( color2[0] < 0.f ) color2[0] = 0.f;
	if( color2[1] < 0.f ) color2[1] = 0.f;
	if( color2[2] < 0.f ) color2[2] = 0.f;
	if( color3[0] < 0.f ) color3[0] = 0.f;
	if( color3[1] < 0.f ) color3[1] = 0.f;
	if( color3[2] < 0.f ) color3[2] = 0.f;
}

static void LinearToBumpedLightmap(
	const float		*linearColor, 
	const float		*linearBumpColor1,
	const float		*linearBumpColor2, 
	const float		*linearBumpColor3,
	unsigned char	*ret, 
	unsigned char	*retBump1,
	unsigned char	*retBump2, 
	unsigned char	*retBump3 )
{
	const Vector &linearBump1 = *( ( const Vector * )linearBumpColor1 );
	const Vector &linearBump2 = *( ( const Vector * )linearBumpColor2 );
	const Vector &linearBump3 = *( ( const Vector * )linearBumpColor3 );

	Vector gammaGoal;
	// gammaGoal is premultiplied by 1/overbright, which we want
	gammaGoal[0] = LinearToVertexLight( linearColor[0] );
	gammaGoal[1] = LinearToVertexLight( linearColor[1] );
	gammaGoal[2] = LinearToVertexLight( linearColor[2] );
	Vector bumpAverage = linearBump1;
	bumpAverage += linearBump2;
	bumpAverage += linearBump3;
	bumpAverage *= ( 1.0f / 3.0f );
	
	Vector correctionScale;
	if( *( int * )&bumpAverage[0] != 0 && *( int * )&bumpAverage[1] != 0 && *( int * )&bumpAverage[2] != 0 )
	{
		// fast path when we know that we don't have to worry about divide by zero.
		VectorDivide( gammaGoal, bumpAverage, correctionScale );
//			correctionScale = gammaGoal / bumpSum;
	}
	else
	{
		correctionScale.Init( 0.0f, 0.0f, 0.0f );
		if( bumpAverage[0] != 0.0f )
		{
			correctionScale[0] = gammaGoal[0] / bumpAverage[0];
		}
		if( bumpAverage[1] != 0.0f )
		{
			correctionScale[1] = gammaGoal[1] / bumpAverage[1];
		}
		if( bumpAverage[2] != 0.0f )
		{
			correctionScale[2] = gammaGoal[2] / bumpAverage[2];
		}
	}
	Vector correctedBumpColor1;
	Vector correctedBumpColor2;
	Vector correctedBumpColor3;
	VectorMultiply( linearBump1, correctionScale, correctedBumpColor1 );
	VectorMultiply( linearBump2, correctionScale, correctedBumpColor2 );
	VectorMultiply( linearBump3, correctionScale, correctedBumpColor3 );

	Vector check = ( correctedBumpColor1 + correctedBumpColor2 + correctedBumpColor3 ) / 3.0f;

	ColorClampBumped( correctedBumpColor1, correctedBumpColor2, correctedBumpColor3 );

	ret[0] = RoundFloatToByte( gammaGoal[0] * 255.0f );
	ret[1] = RoundFloatToByte( gammaGoal[1] * 255.0f );
	ret[2] = RoundFloatToByte( gammaGoal[2] * 255.0f );
	retBump1[0] = RoundFloatToByte( correctedBumpColor1[0] * 255.0f );
	retBump1[1] = RoundFloatToByte( correctedBumpColor1[1] * 255.0f );
	retBump1[2] = RoundFloatToByte( correctedBumpColor1[2] * 255.0f );
	retBump2[0] = RoundFloatToByte( correctedBumpColor2[0] * 255.0f );
	retBump2[1] = RoundFloatToByte( correctedBumpColor2[1] * 255.0f );
	retBump2[2] = RoundFloatToByte( correctedBumpColor2[2] * 255.0f );
	retBump3[0] = RoundFloatToByte( correctedBumpColor3[0] * 255.0f );
	retBump3[1] = RoundFloatToByte( correctedBumpColor3[1] * 255.0f );
	retBump3[2] = RoundFloatToByte( correctedBumpColor3[2] * 255.0f );
}

//-----------------------------------------------------------------------------
// Convert a RGBExp32 to a RGBA8888
// This matches the engine's conversion, so the lighting result is consistent.
//-----------------------------------------------------------------------------
void ConvertRGBExp32ToRGBA8888( const ColorRGBExp32 *pSrc, unsigned char *pDst )
{
	Vector		linearColor;
	Vector		vertexColor;

	// convert from ColorRGBExp32 to linear space
	linearColor[0] = TexLightToLinear( ((ColorRGBExp32 *)pSrc)->r, ((ColorRGBExp32 *)pSrc)->exponent );
	linearColor[1] = TexLightToLinear( ((ColorRGBExp32 *)pSrc)->g, ((ColorRGBExp32 *)pSrc)->exponent );
	linearColor[2] = TexLightToLinear( ((ColorRGBExp32 *)pSrc)->b, ((ColorRGBExp32 *)pSrc)->exponent );

	// convert from linear space to lightmap space
	// cannot use mathlib routine directly because it doesn't match
	// the colorspace version found in the engine, which *is* the same sequence here
	vertexColor[0] = LinearToVertexLight( linearColor[0] );
	vertexColor[1] = LinearToVertexLight( linearColor[1] );
	vertexColor[2] = LinearToVertexLight( linearColor[2] );

	// this is really a color normalization with a floor
	ColorClamp( vertexColor );

	// final [0..255] scale
	pDst[0] = RoundFloatToByte( vertexColor[0] * 255.0f );
	pDst[1] = RoundFloatToByte( vertexColor[1] * 255.0f );
	pDst[2] = RoundFloatToByte( vertexColor[2] * 255.0f );
	pDst[3] = 255;
}