//========== Copyright (c) Valve Corporation, All rights reserved. ========
//
// Purpose: Lightmap page allocation, texture creation and lightmap bit updates for the material system
//
//=============================================================================

#include "pch_materialsystem.h"

#ifndef _PS3
#define MATSYS_INTERNAL
#endif

#include "cmatlightmaps.h"

#include "colorspace.h"
#include "IHardwareConfigInternal.h"

#include "cmaterialsystem.h"

// NOTE: This must be the last file included!!!
#include "tier0/memdbgon.h"
#include "bitmap/floatbitmap.h"

// Console variable (default "0") read by EndLightmapAllocation() and CleanupLightmaps()
// in this file: when set, an array of FloatBitMap_t pointers is allocated per lightmap
// page and any non-NULL entries are written out as "Lightmap-Page-<n>.pfm" during cleanup.
static ConVar mat_lightmap_pfms( "mat_lightmap_pfms", "0", FCVAR_MATERIAL_SYSTEM_THREAD, "Outputs .pfm files containing lightmap data for each lightmap page when a level exits." ); // Write PFM files for each lightmap page in the game directory when exiting a level 

// Turning off 32 bit lightmaps for Portal 2, to save shader perf --Thorsten
//#define USE_32BIT_LIGHTMAPS_ON_360 //uncomment to use 32bit lightmaps, be sure to keep this in sync with the same #define in stdshaders/lightmappedgeneric_ps2_3_x.h

#ifdef _X360
// 7LS - fixup support for lightmap alpha channel data for csm's, definitely do this when/if turning dynamic lightmaps back on
// #define X360_USE_SIMD_LIGHTMAP
#endif

//-----------------------------------------------------------------------------

// Returns the current material as reported by the material system's render context.
inline IMaterialInternal* CMatLightmaps::GetCurrentMaterialInternal() const
{
	return GetMaterialSystem()->GetRenderContextInternal()->GetCurrentMaterialInternal();
}

// Hands pCurrentMaterial to the material system's render context as its new current material.
inline void CMatLightmaps::SetCurrentMaterialInternal(IMaterialInternal* pCurrentMaterial)
{
	GetMaterialSystem()->GetRenderContextInternal()->SetCurrentMaterialInternal( pCurrentMaterial );
}

// Looks up the material for handle idx through the material system.
inline IMaterialInternal *CMatLightmaps::GetMaterialInternal( MaterialHandle_t idx ) const
{
	return GetMaterialSystem()->GetMaterialInternal( idx );
}

// Read-only access to the material system's render context.
inline const IMatRenderContextInternal *CMatLightmaps::GetRenderContextInternal() const
{
	return GetMaterialSystem()->GetRenderContextInternal();
}

// Mutable access to the material system's render context.
inline IMatRenderContextInternal *CMatLightmaps::GetRenderContextInternal()
{
	return GetMaterialSystem()->GetRenderContextInternal();
}

// Read-only access to the material system's material dictionary.
inline const CMaterialDict *CMatLightmaps::GetMaterialDict() const
{
	return GetMaterialSystem()->GetMaterialDict();
}

// Mutable access to the material system's material dictionary.
inline CMaterialDict *CMatLightmaps::GetMaterialDict()
{
	return GetMaterialSystem()->GetMaterialDict();
}

//-----------------------------------------------------------------------------
// Constructor: no lightmap pages, no sort IDs, no page locked, and the
// lightmap state set to STATE_DEFAULT
//-----------------------------------------------------------------------------
CMatLightmaps::CMatLightmaps()
{
	m_currentWhiteLightmapMaterial = NULL;
	m_pLightmapPages = NULL;
	m_NumLightmapPages = 0;
	m_numSortIDs = 0;
	// Otherwise indeterminate until EndLightmapAllocation() assigns it
	m_firstDynamicLightmap = 0;
	m_nUpdatingLightmapsStackDepth = 0;
	// -1 means "no lightmap page is currently locked" (see LockLightmap)
	m_nLockedLightmap = -1;
	m_pLightmapDataPtrArray = NULL;
	m_eLightmapsState = STATE_DEFAULT;
}

//-----------------------------------------------------------------------------
// Shutdown: releases everything owned by the lightmap manager by running
// CleanupLightmaps()
//-----------------------------------------------------------------------------
void CMatLightmaps::Shutdown( )
{
	// Clean up all lightmaps
	CleanupLightmaps();
}

//-----------------------------------------------------------------------------
// Assign enumeration IDs to all materials
//-----------------------------------------------------------------------------
void CMatLightmaps::EnumerateMaterials( void )
{
	// iterate in sorted order
	int id = 0;
	for (MaterialHandle_t i = GetMaterialDict()->FirstMaterial(); i != GetMaterialDict()->InvalidMaterial(); i = GetMaterialDict()->NextMaterial(i) )
	{
		GetMaterialInternal(i)->SetEnumerationID( id );
		++id;
	}
}


//-----------------------------------------------------------------------------
// Widest lightmap page we will create: 512 texels, or the hardware's maximum
// texture width when that is smaller
//-----------------------------------------------------------------------------
int CMatLightmaps::GetMaxLightmapPageWidth() const
{
	// FIXME: It's unclear which size we want here. Making pages bigger doesn't
	// drastically increase primitives per DrawIndexedPrimitive call right now,
	// so we don't. Dynamic textures would want bigger pages; the tradeoff is
	// the memory wasted by pages that never fill up.
	//
	// 512x256 is needed because that's the only way bumped lighting on
	// displacements can work given the 128x128 allowance.
	const int nPreferredWidth = 512;
	const int nHardwareLimit = HardwareConfig()->MaxTextureWidth();

	return ( nPreferredWidth > nHardwareLimit ) ? nHardwareLimit : nPreferredWidth;
}

//-----------------------------------------------------------------------------
// Tallest lightmap page we will create: 256 texels, or the hardware's maximum
// texture height when that is smaller
//-----------------------------------------------------------------------------
int CMatLightmaps::GetMaxLightmapPageHeight() const
{
	const int nPreferredHeight = 256;
	const int nHardwareLimit = HardwareConfig()->MaxTextureHeight();

	return ( nPreferredHeight > nHardwareLimit ) ? nHardwareLimit : nPreferredHeight;
}


//-----------------------------------------------------------------------------
// Returns the lightmap page size
//-----------------------------------------------------------------------------
void CMatLightmaps::GetLightmapPageSize( int lightmapPageID, int *pWidth, int *pHeight ) const
{
	switch( lightmapPageID )
	{
	default:
 		Assert( lightmapPageID >= 0 && lightmapPageID < GetNumLightmapPages() );
		*pWidth = m_pLightmapPages[lightmapPageID].m_Width;
		*pHeight = m_pLightmapPages[lightmapPageID].m_Height;
		break;

	case MATERIAL_SYSTEM_LIGHTMAP_PAGE_USER_DEFINED:
		*pWidth = *pHeight = 1;
		AssertOnce( !"Can't use CMatLightmaps to get properties of MATERIAL_SYSTEM_LIGHTMAP_PAGE_USER_DEFINED" );
		break;
	
	case MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE:
	case MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE_BUMP:
		*pWidth = *pHeight = 1;
		break;
	}
}

//-----------------------------------------------------------------------------
// Width of a lightmap page. The special white / white-bump / user-defined page
// IDs report 1; any other ID indexes the allocated page array.
//-----------------------------------------------------------------------------
int CMatLightmaps::GetLightmapWidth( int lightmapPageID ) const
{
	if ( lightmapPageID == MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE ||
		 lightmapPageID == MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE_BUMP )
	{
		return 1;
	}

	if ( lightmapPageID == MATERIAL_SYSTEM_LIGHTMAP_PAGE_USER_DEFINED )
	{
		AssertOnce( !"Can't use CMatLightmaps to get properties of MATERIAL_SYSTEM_LIGHTMAP_PAGE_USER_DEFINED" );
		return 1;
	}

	Assert( lightmapPageID >= 0 && lightmapPageID < GetNumLightmapPages() );
	return m_pLightmapPages[lightmapPageID].m_Width;
}

//-----------------------------------------------------------------------------
// Height of a lightmap page. The special white / white-bump / user-defined page
// IDs report 1; any other ID indexes the allocated page array.
//-----------------------------------------------------------------------------
int CMatLightmaps::GetLightmapHeight( int lightmapPageID ) const
{
	if ( lightmapPageID == MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE ||
		 lightmapPageID == MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE_BUMP )
	{
		return 1;
	}

	if ( lightmapPageID == MATERIAL_SYSTEM_LIGHTMAP_PAGE_USER_DEFINED )
	{
		AssertOnce( !"Can't use CMatLightmaps to get properties of MATERIAL_SYSTEM_LIGHTMAP_PAGE_USER_DEFINED" );
		return 1;
	}

	Assert( lightmapPageID >= 0 && lightmapPageID < GetNumLightmapPages() );
	return m_pLightmapPages[lightmapPageID].m_Height;
}


//-----------------------------------------------------------------------------
// Clean up lightmap pages.
//-----------------------------------------------------------------------------
void CMatLightmaps::CleanupLightmaps()
{
	GetMaterialSystem()->GetPaintmaps()->CleanupPaintmaps();
	if ( mat_lightmap_pfms.GetBool())
	{
	  // Write PFM files containing lightmap data for this page
	  for (int lightmap = 0; lightmap < GetNumLightmapPages(); lightmap++)
	  {
		 if ((NULL != m_pLightmapDataPtrArray) && (NULL != m_pLightmapDataPtrArray[lightmap]))
		 {
			char szPFMFileName[MAX_PATH];

			sprintf(szPFMFileName, "Lightmap-Page-%d.pfm", lightmap);
			m_pLightmapDataPtrArray[lightmap]->WritePFM(szPFMFileName);
		 }
	  }
	}

	// Remove the lightmap data bitmap representations
	if (m_pLightmapDataPtrArray)
	{
	  int i;
	  for( i = 0; i < GetNumLightmapPages(); i++ )
	  {
		 delete m_pLightmapDataPtrArray[i];
	  }

	  delete [] m_pLightmapDataPtrArray;
	  m_pLightmapDataPtrArray = NULL;
	}

	// delete old lightmap pages
	if( m_pLightmapPages )
	{
		int i;
		for( i = 0; i < GetNumLightmapPages(); i++ )
		{
			g_pShaderAPI->DeleteTexture( m_LightmapPageTextureHandles[i] );
		}
		delete [] m_pLightmapPages;
		m_pLightmapPages = 0;
	}

	m_NumLightmapPages = 0;
}

//-----------------------------------------------------------------------------
// Resets the lightmap page info for each material
//-----------------------------------------------------------------------------
void CMatLightmaps::ResetMaterialLightmapPageInfo( void )
{
	for (MaterialHandle_t i = GetMaterialDict()->FirstMaterial(); i != GetMaterialDict()->InvalidMaterial(); i = GetMaterialDict()->NextMaterial(i) )
	{
		IMaterialInternal *pMaterial = GetMaterialInternal(i);
		pMaterial->SetMinLightmapPageID( 9999 );
		pMaterial->SetMaxLightmapPageID( -9999 );
		pMaterial->SetNeedsWhiteLightmap( false );
	}
}

//-----------------------------------------------------------------------------
// This is called before any lightmap allocations take place
//-----------------------------------------------------------------------------
void CMatLightmaps::BeginLightmapAllocation()
{
	// we clean up lightmaps on console right before we load the next map
	if ( IsPC() )
	{
		CleanupLightmaps();
	}

	m_ImagePackers.RemoveAll();
	int i = m_ImagePackers.AddToTail();
	m_ImagePackers[i].Reset( 0, GetMaxLightmapPageWidth(), GetMaxLightmapPageHeight() );

	SetCurrentMaterialInternal(0);
	m_currentWhiteLightmapMaterial = 0;
	m_numSortIDs = 0;

	// need to set the min and max sorting id number for each material to 
	// a default value that basically means that it hasn't been used yet.
	ResetMaterialLightmapPageInfo();

	EnumerateMaterials();
}


//-----------------------------------------------------------------------------
// Allocates space in the lightmaps; must be called after BeginLightmapAllocation
//
// width, height          - size of the block to reserve
// offsetIntoLightmapPage - receives the block's position as reported by the
//                          image packer's AddBlock()
// iMaterial              - material the block belongs to; its real-time version
//                          is what gets tracked
//
// Returns the sort ID of the image packer the block was placed in. With a NULL
// material a warning is printed and the current m_numSortIDs is returned.
//-----------------------------------------------------------------------------
int CMatLightmaps::AllocateLightmap( int width, int height, 
		                               int offsetIntoLightmapPage[2],
									   IMaterial *iMaterial )
{
	IMaterialInternal *pMaterial = static_cast<IMaterialInternal *>( iMaterial );
	if ( !pMaterial )
	{
		Warning( "Programming error: CMatRenderContext::AllocateLightmap: NULL material\n" );
		return m_numSortIDs;
	}
	pMaterial = pMaterial->GetRealTimeVersion(); //always work with the real time versions of materials internally
	
	// material change
	int i;
	int nPackCount = m_ImagePackers.Count();
	if ( GetCurrentMaterialInternal() != pMaterial )
	{
		// If this happens, then we need to close out all image packers other than
		// the last one so as to produce as few sort IDs as possible
		// (i starts at the last index and is pre-decremented, so the last packer is never removed)
		for ( i = nPackCount - 1; --i >= 0; )
		{
			// NOTE: We *must* use the order preserving one here so the remaining one
			// is the last lightmap
			m_ImagePackers.Remove( i );
			--nPackCount;
		}

		// If it's not the first material, increment the sort id
		if (GetCurrentMaterialInternal())
		{
			m_ImagePackers[0].IncrementSortId( );
			++m_numSortIDs;
		}

		SetCurrentMaterialInternal(pMaterial);

		// This assertion guarantees we don't see the same material twice in this loop.
		// (ResetMaterialLightmapPageInfo leaves min > max on materials not yet visited)
		Assert( pMaterial->GetMinLightmapPageID( ) > pMaterial->GetMaxLightmapPageID() );

		// NOTE: We may not use this lightmap page, but we might
		// we won't know for sure until the next material is passed in.
		// So, for now, we're going to forcibly add the current lightmap
		// page to this material so the sort IDs work out correctly.
		GetCurrentMaterialInternal()->SetMinLightmapPageID( GetNumLightmapPages() );
		GetCurrentMaterialInternal()->SetMaxLightmapPageID( GetNumLightmapPages() );
	}

	// Try to add it to any of the current images...
	bool bAdded = false;
	for ( i = 0; i < nPackCount; ++i )
	{
		bAdded = m_ImagePackers[i].AddBlock( width, height, &offsetIntoLightmapPage[0], &offsetIntoLightmapPage[1] );
		if ( bAdded )
			break;
	}

	if ( !bAdded )
	{
		// Nothing had room: open a new packer (a new page and a new sort ID)
		++m_numSortIDs;
		i = m_ImagePackers.AddToTail();
		m_ImagePackers[i].Reset( m_numSortIDs, GetMaxLightmapPageWidth(), GetMaxLightmapPageHeight() );
		++m_NumLightmapPages;
		if ( !m_ImagePackers[i].AddBlock( width, height, &offsetIntoLightmapPage[0], &offsetIntoLightmapPage[1] ) )
		{
			Error( "MaterialSystem_Interface_t::AllocateLightmap: lightmap (%dx%d) too big to fit in page (%dx%d)\n", 
				width, height, GetMaxLightmapPageWidth(), GetMaxLightmapPageHeight() );
		}

		// Add this lightmap to the material...
		GetCurrentMaterialInternal()->SetMaxLightmapPageID( GetNumLightmapPages() );
	}

	// i indexes the packer that accepted the block (existing or newly added)
	return m_ImagePackers[i].GetSortId();
}

// UNDONE: This needs testing, but it appears as though creating these textures managed
// results in huge stalls whenever they are locked for modify.
// That makes sense given the d3d docs, but these have been flagged as managed for quite some time.
// NOTE(review): no use of DYNAMIC_TEXTURES_NO_BACKING is visible in the portion of this
// file that was reviewed; confirm where it is consumed before changing or removing it.
#define DYNAMIC_TEXTURES_NO_BACKING 1

void CMatLightmaps::EndLightmapAllocation()
{
	// count the last page that we were on.if it wasn't 
	// and count the last sortID that we were on
	m_NumLightmapPages++; 
	m_numSortIDs++;

	m_firstDynamicLightmap = m_NumLightmapPages;
	// UNDONE: Until we start using the separate dynamic lighting textures don't allocate them
	// NOTE: Enable this if we want to stop locking the base lightmaps and instead only lock update
	// these completely dynamic pages
//	m_NumLightmapPages += COUNT_DYNAMIC_LIGHTMAP_PAGES;
	m_dynamic.Init();

	// Compute the dimensions of the last lightmap 
	int lastLightmapPageWidth, lastLightmapPageHeight;
	int nLastIdx = m_ImagePackers.Count();
	m_ImagePackers[nLastIdx - 1].GetMinimumDimensions( &lastLightmapPageWidth, &lastLightmapPageHeight );
	m_ImagePackers.Purge();

	m_pLightmapPages = new LightmapPageInfo_t[GetNumLightmapPages()];
	Assert( m_pLightmapPages );

   if ( mat_lightmap_pfms.GetBool())
   {
      // This array will be used to write PFM files full of lightmap data
      m_pLightmapDataPtrArray = new FloatBitMap_t*[GetNumLightmapPages()];
   }

   if( GetMaterialSystem()->GetPaintmaps()->IsEnabled() )
   {
		GetMaterialSystem()->GetPaintmaps()->BeginPaintTextureAllocation( GetNumLightmapPages() );
   }

	int i;
	m_LightmapPageTextureHandles.EnsureCapacity( GetNumLightmapPages() );
	for ( i = 0; i < GetNumLightmapPages(); i++ )
	{
		// Compute lightmap dimensions
		bool lastStaticLightmap = ( i == (m_firstDynamicLightmap-1));
		m_pLightmapPages[i].m_Width = (unsigned short)(lastStaticLightmap ? lastLightmapPageWidth : GetMaxLightmapPageWidth());
		m_pLightmapPages[i].m_Height = (unsigned short)(lastStaticLightmap ? lastLightmapPageHeight : GetMaxLightmapPageHeight());
		m_pLightmapPages[i].m_Flags = 0;

		AllocateLightmapTexture( i );
		
		if ( GetMaterialSystem()->GetPaintmaps()->IsEnabled() )
		{
			GetMaterialSystem()->GetPaintmaps()->AllocatePaintmap( i, GetLightmapWidth(i), GetLightmapHeight(i) );
		}

        if ( mat_lightmap_pfms.GetBool())
        {
           // Initialize the pointers to lightmap data
           m_pLightmapDataPtrArray[i] = NULL;
        }
	}

	if( GetMaterialSystem()->GetPaintmaps()->IsEnabled() )
	{
		GetMaterialSystem()->GetPaintmaps()->EndPaintTextureAllocation();
	}
}


// Cheat-flagged console variable (default "0"). AllocateLightmapTexture() only treats
// lightmap pages as dynamic textures when this is set AND the hardware config prefers
// dynamic textures.
ConVar mat_dynamiclightmaps( "mat_dynamiclightmaps", "0", FCVAR_CHEAT );

//-----------------------------------------------------------------------------
// Allocate lightmap textures
//
// Creates the shader API texture backing lightmap page 'lightmap' and fills it
// with its initial contents (InitLightmapBits). The handle array is grown as
// needed, with new slots set to INVALID_SHADERAPI_TEXTURE_HANDLE. The texture
// is only created in STATE_DEFAULT; in STATE_RELEASED (or any other state) the
// function just logs and returns without creating anything.
//-----------------------------------------------------------------------------
void CMatLightmaps::AllocateLightmapTexture( int lightmap )
{
	bool bUseDynamicTextures = HardwareConfig()->PreferDynamicTextures() && mat_dynamiclightmaps.GetBool();

	// Pick between a dynamic and a managed texture
	int flags = 0;
	if ( bUseDynamicTextures || IsPS3() ) // On PS3, we need the dynamic flag as a hint that we're going to update this texture incrementally in the future
	{
		flags |= TEXTURE_CREATE_DYNAMIC;
	}
	else
	{
		flags |= TEXTURE_CREATE_MANAGED;
	}

	// Grow the handle array to cover this page; only newly added slots are reset to invalid
	int nPreviousTextureHandles = m_LightmapPageTextureHandles.Count();
	m_LightmapPageTextureHandles.EnsureCount( lightmap + 1 );
	for ( int nLightmap = nPreviousTextureHandles; nLightmap <= lightmap; ++nLightmap )
	{
		m_LightmapPageTextureHandles[ nLightmap ] = INVALID_SHADERAPI_TEXTURE_HANDLE;
	}

	char debugName[256];
	Q_snprintf( debugName, sizeof( debugName ), "[lightmap %d]", lightmap );
	
	// Choose the texel format from the HDR mode (and platform)
	ImageFormat imageFormat;
	switch ( HardwareConfig()->GetHDRType() )
	{
	default:
		Assert( 0 );
		// fall through.

	case HDR_TYPE_NONE:
#if !defined( _X360 )
		imageFormat = IMAGE_FORMAT_RGBA8888;
		flags |= TEXTURE_CREATE_SRGB;
#else
		imageFormat = IMAGE_FORMAT_LINEAR_RGBA8888;
#endif
		break;

	case HDR_TYPE_INTEGER:
#if !defined( _X360 )
		imageFormat = IMAGE_FORMAT_RGBA16161616;
#else
#		if ( defined( USE_32BIT_LIGHTMAPS_ON_360 ) )
			imageFormat = IMAGE_FORMAT_LINEAR_RGBA8888;
#		else
			imageFormat = IMAGE_FORMAT_LINEAR_RGBA16161616;
#		endif
#endif
		break;

	case HDR_TYPE_FLOAT:
		imageFormat = IMAGE_FORMAT_RGBA16161616F;
		break;
	}

#ifdef _PS3
	// PS3 needs 16F textures...but the HDR_TYPE_FLOAT codepath has a lot of other baggage with it.  Just lie here.
	// (this overrides whatever the switch above selected)
	imageFormat = IMAGE_FORMAT_RGBA16161616F;

#endif // _PS3


	switch ( m_eLightmapsState )
	{
	case STATE_DEFAULT:
		// Allow allocations in default state
		{
			int iWidth = GetLightmapWidth(lightmap);
			int iHeight = GetLightmapHeight(lightmap);

			m_LightmapPageTextureHandles[lightmap] = g_pShaderAPI->CreateTexture( 
				iWidth, iHeight, 1,
				imageFormat, 
				1, 1, flags, debugName, TEXTURE_GROUP_LIGHTMAP );	// don't mipmap lightmaps

			// Load up the texture data
			g_pShaderAPI->ModifyTexture( m_LightmapPageTextureHandles[lightmap] );
			g_pShaderAPI->TexMinFilter( SHADER_TEXFILTERMODE_LINEAR );
			g_pShaderAPI->TexMagFilter( SHADER_TEXFILTERMODE_LINEAR );

			// Priority is only set for the non-dynamic case
			if ( !bUseDynamicTextures )
			{
				g_pShaderAPI->TexSetPriority( 1 );
			}

			// Blat out the lightmap bits
			InitLightmapBits( lightmap );
		}
		break;

	case STATE_RELEASED:
		// Not assigned m_LightmapPageTextureHandles[lightmap];
		// RestoreLightmapPages() calls this function again for every page once the state is back to default
		DevMsg( "AllocateLightmapTexture(%d) in released lightmap state (STATE_RELEASED), delayed till \"Restore\".\n", lightmap );
		return;

	default:
		// Not assigned m_LightmapPageTextureHandles[lightmap];
		Warning( "AllocateLightmapTexture(%d) in unknown lightmap state (%d), skipped.\n", lightmap, m_eLightmapsState );
		Assert( !"AllocateLightmapTexture(?) in unknown lightmap state (?)" );
		return;
	}
}


//-----------------------------------------------------------------------------
// Registers a material that uses the white lightmap and returns the sort ID
// to use for it. A new sort ID is started whenever the material differs from
// the previous white-lightmap material, except for the very first material
// of an allocation pass.
//-----------------------------------------------------------------------------
int	CMatLightmaps::AllocateWhiteLightmap( IMaterial *iMaterial )
{
	IMaterialInternal *pRealTime = static_cast<IMaterialInternal *>( iMaterial );
	if ( pRealTime == NULL )
	{
		Warning( "Programming error: CMatRenderContext::AllocateWhiteLightmap: NULL material\n" );
		return m_numSortIDs;
	}

	// always work with the real time versions of materials internally
	pRealTime = pRealTime->GetRealTimeVersion();

	// Same white-lightmap material as last time: nothing changes
	if ( m_currentWhiteLightmapMaterial && ( m_currentWhiteLightmapMaterial == pRealTime ) )
	{
		return m_numSortIDs;
	}

	// Material change. Don't bump the sort ID if this is the very first
	// material (ie. nothing allocated with AllocateLightmap and no earlier
	// white-lightmap material).
	if ( GetCurrentMaterialInternal() || m_currentWhiteLightmapMaterial )
	{
		++m_numSortIDs;
#if 0
		char buf[128];
		Q_snprintf( buf, sizeof( buf ), "AllocateWhiteLightmap: m_numSortIDs = %d %s\n", m_numSortIDs, pRealTime->GetName() );
		OutputDebugString( buf );
#endif
	}

	m_currentWhiteLightmapMaterial = pRealTime;
	pRealTime->SetNeedsWhiteLightmap( true );

	return m_numSortIDs;
}

//-----------------------------------------------------------------------------
// Releases the lightmap page textures and the paintmaps, and moves to
// STATE_RELEASED. Only honoured in STATE_DEFAULT; otherwise it warns,
// asserts and does nothing.
//-----------------------------------------------------------------------------
void CMatLightmaps::ReleaseLightmapPages()
{
	if ( m_eLightmapsState != STATE_DEFAULT )
	{
		Warning( "ReleaseLightmapPages is expected in STATE_DEFAULT, current state = %d, discarded.\n", m_eLightmapsState );
		Assert( !"ReleaseLightmapPages is expected in STATE_DEFAULT" );
		return;
	}

	for ( int nPage = 0; nPage < GetNumLightmapPages(); ++nPage )
	{
		g_pShaderAPI->DeleteTexture( m_LightmapPageTextureHandles[nPage] );
	}

	GetMaterialSystem()->GetPaintmaps()->ReleasePaintmaps();

	// From here on the pages count as released until RestoreLightmapPages
	m_eLightmapsState = STATE_RELEASED;
}

//-----------------------------------------------------------------------------
// Counterpart of ReleaseLightmapPages: goes back to STATE_DEFAULT, restores
// the paintmaps (when enabled) and re-creates every lightmap page texture.
// Only honoured in STATE_RELEASED; otherwise it warns, asserts and does
// nothing.
//-----------------------------------------------------------------------------
void CMatLightmaps::RestoreLightmapPages()
{
	if ( m_eLightmapsState != STATE_RELEASED )
	{
		Warning( "RestoreLightmapPages is expected in STATE_RELEASED, current state = %d, discarded.\n", m_eLightmapsState );
		Assert( !"RestoreLightmapPages is expected in STATE_RELEASED" );
		return;
	}

	// AllocateLightmapTexture refuses to create textures unless we are in
	// the default state, so switch first.
	m_eLightmapsState = STATE_DEFAULT;

	if ( GetMaterialSystem()->GetPaintmaps()->IsEnabled() )
	{
		GetMaterialSystem()->GetPaintmaps()->RestorePaintmaps( GetNumLightmapPages() );
	}

	for ( int nPage = 0; nPage < GetNumLightmapPages(); ++nPage )
	{
		AllocateLightmapTexture( nPage );
	}
}


//-----------------------------------------------------------------------------
// This initializes the lightmap bits
//
// Locks the whole texture of lightmap page 'lightmap' and writes an initial
// value into every texel. Debug builds write a green/black checkerboard;
// release builds write a constant value. Returns without writing anything if
// the texture lock fails.
//-----------------------------------------------------------------------------
void CMatLightmaps::InitLightmapBits( int lightmap )
{
	VPROF_( "CMatLightmaps::InitLightmapBits", 1, VPROF_BUDGETGROUP_DLIGHT_RENDERING, false, 0 );
	int width = GetLightmapWidth(lightmap);
	int height = GetLightmapHeight(lightmap);

	CPixelWriter writer;

	g_pShaderAPI->ModifyTexture( m_LightmapPageTextureHandles[lightmap] );
	if ( !g_pShaderAPI->TexLock( 0, 0, 0, 0, width, height, writer ) )
		return;

	// Debug mode, make em green checkerboard
	// Float-format textures take the first branch, everything else the second.
	if ( writer.IsUsingFloatFormat() )
	{
		for ( int j = 0; j < height; ++j )
		{
			writer.Seek( 0, j );
			for ( int k = 0; k < width; ++k )
			{
#ifndef _DEBUG
				// NOTE(review): this release path calls WritePixel with float literals while
				// the debug path below uses WritePixelF; confirm WritePixel is the intended
				// call for a float-format texture.
				writer.WritePixel( 1.0f, 1.0f, 1.0f );
#else // _DEBUG
				if( ( j + k ) & 1 )
				{
					writer.WritePixelF( 0.0f, 1.0f, 0.0f );
				}
				else
				{
					writer.WritePixelF( 0.0f, 0.0f, 0.0f );
				}
#endif // _DEBUG
			}
		}
	}
	else
	{
#if defined( _X360 ) && defined( _DEBUG )
		// Checkerboard green for the 360 debug path, loaded once outside the loops
		float vGreenData[4] =  { 0.0f, 2.0f, 0.0f, 0.0f };
		fltx4 vGreen = LoadUnalignedSIMD( vGreenData );
#endif
		for ( int j = 0; j < height; ++j )
		{
			writer.Seek( 0, j );
			for ( int k = 0; k < width; ++k )
			{
#ifndef _DEBUG
				// note: make this white to find multisample centroid sampling problems.
				//				writer.WritePixel( 255, 255, 255 );
				#ifdef _X360
				{
					writer.WritePixel( Four_Zeros );
				}
				#else
				{
					writer.WritePixel( 0, 0, 0 );
				}
				#endif
#else // _DEBUG
				#ifdef _X360
				{
					if ( ( j + k ) & 1 )
					{
						writer.WritePixel( vGreen );
					}
					else
					{
						writer.WritePixel( Four_Zeros );
					}
				}
				#else
				{
					if ( ( j + k ) & 1 )
					{
						writer.WritePixel( 0, 255, 0 );
					}
					else
					{
						writer.WritePixel( 0, 0, 0 );
					}
				}
				#endif // _X360
#endif // _DEBUG
			}
		}
	}

	g_pShaderAPI->TexUnlock();
}

bool CMatLightmaps::LockLightmap( int lightmap )
{
//	Warning( "locking lightmap page: %d\n", lightmap );
	VPROF_INCREMENT_COUNTER( "lightmap fullpage texlock", 1 );
	if( m_nLockedLightmap != -1 )
	{
		g_pShaderAPI->TexUnlock();
	}
	g_pShaderAPI->ModifyTexture( m_LightmapPageTextureHandles[lightmap] );
	int pageWidth  = m_pLightmapPages[lightmap].m_Width;
	int pageHeight = m_pLightmapPages[lightmap].m_Height;
	if (!g_pShaderAPI->TexLock( 0, 0, 0, 0,	pageWidth, pageHeight, m_LightmapPixelWriter ))
	{
		Assert( 0 );
		return false;
	}
	m_nLockedLightmap = lightmap;
	return true;
}

// Converts a lightmap color (three floats read from lightmapColor[0..2]) into
// an RGB + scale encoding:
//   - the scale is the brightest channel, rounded up to a multiple of 16/255
//     and capped at 16
//   - x/y/z hold each channel divided by that scale, rounded up to a multiple
//     of 1/255 and capped at 1
//   - w holds the scale divided by 16
Vector4D ConvertLightmapColorToRGBScale( const float *lightmapColor )
{
	// Brightest of the three color channels
	float flBrightest = lightmapColor[0];
	if ( lightmapColor[1] > flBrightest )
		flBrightest = lightmapColor[1];
	if ( lightmapColor[2] > flBrightest )
		flBrightest = lightmapColor[2];

	// Quantize upwards, then cap
	flBrightest = ceil( flBrightest * (255.0f/16.0f) ) * (16.0f/255.0f);
	flBrightest = MIN( flBrightest, 16.0f );

	const float flReciprocal = 1.0f / flBrightest;

	Vector4D packed;
	for ( int nChannel = 0; nChannel < 3; ++nChannel )
	{
		float flChannel = lightmapColor[nChannel] * flReciprocal;
		flChannel = ceil( flChannel * 255.0f ) * (1.0f/255.0f);
		packed[nChannel] = MIN( flChannel, 1.0f );
	}

	packed.w = flBrightest / 16.0f;

	return packed;
}

#ifdef _X360
// SIMD version of above
// input numbers from pSrc are on the domain [0..16]
// output is RGBA 
// ignores contents of w channel of input
// the shader does this: rOut = Rin * Ain * 16.0f 
// where Rin is [0..1], a float computed from a byte value [0..255]
// Ain is therefore the brightest channel (say R) divided by 16 and quantized
// Rin is computed from pSrc->r by dividing by Ain
// this outputs RGBa where RGB are [0..255] and a is the shader's scaling factor (also 0..255)
//
// WARNING - this code appears to be vulnerable to a compiler bug. Be very careful modifying and be
// sure to test
fltx4 ConvertLightmapColorToRGBScale( FLTX4 lightmapColor )
{
	
	static const fltx4 vTwoFiftyFive = {255.0f, 255.0f, 255.0f, 255.0f};
	static const fltx4 FourPoint1s = { 0.1, 0.1, 0.1, 0.1 };
	static const fltx4 vTwoFiftyFiveOverSixteen = {255.0f / 16.0f, 255.0f / 16.0f, 255.0f / 16.0f, 255.0f / 16.0f};
	// static const fltx4 vSixteenOverTwoFiftyFive = { 16.0f / 255.0f, 16.0f / 255.0f, 16.0f / 255.0f, 16.0f / 255.0f };


	// find the highest color value in lightmapColor and replicate it
	fltx4 scale = FindHighestSIMD3( lightmapColor );
	fltx4 minscale = FindLowestSIMD3( lightmapColor );
	// Mask is set when the brightest channel is >= Four_Ones or <= 0.1 ...
	fltx4 fl4OutofRange = OrSIMD( CmpGeSIMD( scale, Four_Ones ), CmpLeSIMD( scale, FourPoint1s ) );
	// ... or when the darkest channel exceeds Four_PointFives (presumably 0.5 - confirm) times the brightest
	fl4OutofRange = OrSIMD( fl4OutofRange, CmpGtSIMD( minscale, MulSIMD( Four_PointFives, scale ) ) );

	// scale needs to be divided by 16 (because the shader multiplies it by 16)
	// then mapped to 0..255 and quantized. 
	scale = __vrfip(MulSIMD(scale, vTwoFiftyFiveOverSixteen)); // scale = ceil(scale * 255/16)
		
	fltx4 result = MulSIMD(vTwoFiftyFive, lightmapColor); // start the scale cooking on the final result
		
	fltx4 invScale = ReciprocalEstSIMD(scale); // invScale = (16/255)(1/scale). may be +inf
	invScale = MulSIMD(invScale, vTwoFiftyFiveOverSixteen); // take the quantizing factor back out
															// of the inverse scale (one less
															// dependent op if you do it this way)
		
	// scale the input channels
	// compute so the numbers are all 0..255 ints. (if one happens to 
	// be 256 due to numerical error in the reciprocation, the unsigned-saturate
	// store we'll use later on will bake it back down to 255)
	result = MulSIMD(result, invScale);
		
	// now, output --
	// if the input color was nonzero, slip the scale into return value's w
	// component and return. If the input was zero, return zero.
	// NOTE(review): the selection below is driven by fl4OutofRange (computed above from
	// range tests), not by a zero test, and the alternative operand is lightmapColor * 255
	// with 255/16 in w rather than zero. The comment above looks out of date - confirm
	// against MaskedAssign's operand order.

	result = MaskedAssign( 
		fl4OutofRange,
		SetWSIMD( result, scale ),
		SetWSIMD( MulSIMD( lightmapColor, vTwoFiftyFive ), vTwoFiftyFiveOverSixteen ) );
	return result;
}
#endif


// write bumped lightmap update to LDR 8-bit lightmap
//
// pFloatImage / pFloatImageBump1..3 - source texels for the unbumped image and the three
//                                     bump images, four floats (one Vector4D) per texel,
//                                     pLightmapSize[0] texels per row
// pLightmapSize                     - width [0] and height [1] of the update, in texels
// pOffsetIntoLightmapPage           - x [0] / y [1] position of the update within the page
// pfmOut                            - optional; when non-NULL the unbumped result is also
//                                     written into this float bitmap
//
// Writes through m_LightmapPixelWriter. For every source texel four pixels are written,
// each one lightmap-width worth of bytes after the previous one.
void CMatLightmaps::BumpedLightmapBitsToPixelWriter_LDR( float* pFloatImage, float *pFloatImageBump1, float *pFloatImageBump2, 
	float *pFloatImageBump3, int pLightmapSize[2], int pOffsetIntoLightmapPage[2], FloatBitMap_t *pfmOut )
{
	const int nLightmapSize0 = pLightmapSize[0];
	// Byte distance between the same texel of two consecutive images
	const int nLightmap0WriterSizeBytes = nLightmapSize0 * m_LightmapPixelWriter.GetPixelSize();
	// After the fourth write the writer sits three image-widths in; this steps back to
	// the first image, one pixel further along
	const int nRewindToNextPixel = -( ( nLightmap0WriterSizeBytes * 3 ) - m_LightmapPixelWriter.GetPixelSize() );

	for( int t = 0; t < pLightmapSize[1]; t++ )
	{
		// Index of the first float of row t in the source images
		int srcTexelOffset = ( sizeof( Vector4D ) / sizeof( float ) ) * ( 0 + t * nLightmapSize0 );
		m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );

		for( int s = 0; s < nLightmapSize0; 
			s++, m_LightmapPixelWriter.SkipBytes(nRewindToNextPixel),srcTexelOffset += (sizeof(Vector4D)/sizeof(float)))
		{
			// color[0] is the unbumped result, color[1..3] the three bumped ones
			unsigned char color[4][4];

			ColorSpace::LinearToBumpedLightmap( &pFloatImage[srcTexelOffset],
				&pFloatImageBump1[srcTexelOffset], &pFloatImageBump2[srcTexelOffset],
				&pFloatImageBump3[srcTexelOffset],
				color[0], color[1], color[2], color[3] );

			if ( HardwareConfig()->GetCSMAccurateBlending() )
			{
				// Per-image alpha, converted from each image's own fourth float
				ColorSpace::LinearToBumpedLightmapAlpha( &pFloatImage[srcTexelOffset + 3], 
														 &pFloatImageBump1[srcTexelOffset + 3], &pFloatImageBump2[srcTexelOffset + 3], &pFloatImageBump3[srcTexelOffset + 3],
														 &color[0][3], &color[1][3], &color[2][3], &color[3][3] );
			}
			else
			{
				// All four pixels share the unbumped image's alpha
				unsigned char alpha =  RoundFloatToByte( pFloatImage[srcTexelOffset+3] * 255.0f );
				color[0][3] = alpha;
				color[1][3] = alpha;
				color[2][3] = alpha;
				color[3][3] = alpha;
			}

			m_LightmapPixelWriter.WritePixelNoAdvance( color[0][0], color[0][1], color[0][2], color[0][3] );

			m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
			m_LightmapPixelWriter.WritePixelNoAdvance( color[1][0], color[1][1], color[1][2], color[1][3] );

			m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
			m_LightmapPixelWriter.WritePixelNoAdvance( color[2][0], color[2][1], color[2][2], color[2][3] );

			m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
			m_LightmapPixelWriter.WritePixelNoAdvance( color[3][0], color[3][1], color[3][2], color[3][3] );
		}
	}
	if ( pfmOut )
	{
		// Second pass for the PFM copy: the conversion is redone per texel, and only the
		// unbumped color plus the unbumped image's alpha are stored (as byte values, even
		// when the CSM-accurate alpha path was used above)
		for( int t = 0; t < pLightmapSize[1]; t++ )
		{
			int srcTexelOffset = ( sizeof( Vector4D ) / sizeof( float ) ) * ( 0 + t * nLightmapSize0 );
			for( int s = 0;  s < nLightmapSize0; s++,srcTexelOffset += (sizeof(Vector4D)/sizeof(float)))
			{
				unsigned char color[4][4];

				ColorSpace::LinearToBumpedLightmap( &pFloatImage[srcTexelOffset],
					&pFloatImageBump1[srcTexelOffset], &pFloatImageBump2[srcTexelOffset],
					&pFloatImageBump3[srcTexelOffset],
					color[0], color[1], color[2], color[3] );

				unsigned char alpha =  RoundFloatToByte( pFloatImage[srcTexelOffset+3] * 255.0f );
				// Write data to the bitmapped representations so that PFM files can be written
				PixRGBAF pixelData;
				pixelData.Red = color[0][0];                  
				pixelData.Green = color[0][1];                  
				pixelData.Blue = color[0][2];
				pixelData.Alpha = alpha;
				pfmOut->WritePixelRGBAF( pOffsetIntoLightmapPage[0] + s, pOffsetIntoLightmapPage[1] + t, 0, pixelData);
			}
		}

	}
}

// Write a bumped lightmap update to an HDR float lightmap.
//
// For every luxel, the base image and the three bump images are run through
// ColorSpace::LinearToBumpedLightmap() and the four resulting colors are
// written through m_LightmapPixelWriter, advancing the writer by one lightmap
// width worth of pixel bytes between each of the four writes.
//
//   pFloatImage             - base luxels, sizeof(Vector4D)/sizeof(float) floats
//                             apiece; the float at index 3 is used as alpha
//   pFloatImageBump1..3     - luxels of the three bump images, same layout
//   pLightmapSize           - [0] = luxels per row, [1] = number of rows
//   pOffsetIntoLightmapPage - x/y handed to the writer's Seek() for row 0
//   pfmOut                  - must be NULL; PFM capture is unsupported in this mode
void CMatLightmaps::BumpedLightmapBitsToPixelWriter_HDRF( float* pFloatImage, float *pFloatImageBump1, float *pFloatImageBump2, 
												 float *pFloatImageBump3, int pLightmapSize[2], int pOffsetIntoLightmapPage[2], FloatBitMap_t *pfmOut )
{
	// 360 does not support HDR float mode
	if ( IsX360() )
	{
		Assert( 0 );
		return;
	}

	// PFM output is unsupported in this mode
	Assert( !pfmOut );

	const int nFloatsPerLuxel = sizeof( Vector4D ) / sizeof( float );
	const int nLuxelsPerRow = pLightmapSize[0];
	const int nRowCount = pLightmapSize[1];

	const int nPixelBytes = m_LightmapPixelWriter.GetPixelSize();
	// distance in the writer from a luxel in one map to the same luxel in the next map
	const int nBytesToNextMap = nLuxelsPerRow * nPixelBytes;
	// after the fourth write: undo the three map skips and step one pixel forward
	const int nBytesBackToNextPixel = nPixelBytes - ( 3 * nBytesToNextMap );

	// NOTE: a console-only fast path for 4 x float16 writers, built on
	// float16::ConvertFourFloatsTo16BitsAtOnce, used to be selected here. That
	// routine is entirely broken, so every luxel takes the general path below.
	for ( int nRow = 0; nRow < nRowCount; ++nRow )
	{
		m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + nRow );

		// index of the first float of this row's first luxel
		int nSrc = nFloatsPerLuxel * nRow * nLuxelsPerRow;

		for ( int nCol = 0; nCol < nLuxelsPerRow; ++nCol )
		{
			float luxel[4][4];

			// [mariod] - LinearToBumpedLightmap() was entirely missing in the float path as of September '11
			// looks like this only affected PS3 (PC/X360 use linear 16bit tex formats)
			ColorSpace::LinearToBumpedLightmap( pFloatImage + nSrc,
				pFloatImageBump1 + nSrc, pFloatImageBump2 + nSrc,
				pFloatImageBump3 + nSrc,
				luxel[0], luxel[1], luxel[2], luxel[3] );

			const int nSrcAlpha = nSrc + 3;
			if ( !HardwareConfig()->GetCSMAccurateBlending() )
			{
				// all four maps share the base image's alpha
				const float flBaseAlpha = pFloatImage[nSrcAlpha];
				luxel[0][3] = flBaseAlpha;
				luxel[1][3] = flBaseAlpha;
				luxel[2][3] = flBaseAlpha;
				luxel[3][3] = flBaseAlpha;
			}
			else
			{
				// each map gets its own alpha, derived from all four source alphas
				ColorSpace::LinearToBumpedLightmapAlpha( pFloatImage + nSrcAlpha,
														 pFloatImageBump1 + nSrcAlpha, pFloatImageBump2 + nSrcAlpha, pFloatImageBump3 + nSrcAlpha,
														 &luxel[0][3], &luxel[1][3], &luxel[2][3], &luxel[3][3] );
			}

			// base map first, then hop to each of the three bump maps in turn
			m_LightmapPixelWriter.WritePixelNoAdvanceF( luxel[0][0], luxel[0][1], luxel[0][2], luxel[0][3] );
			for ( int nMap = 1; nMap < 4; ++nMap )
			{
				m_LightmapPixelWriter.SkipBytes( nBytesToNextMap );
				m_LightmapPixelWriter.WritePixelNoAdvanceF( luxel[nMap][0], luxel[nMap][1], luxel[nMap][2], luxel[nMap][3] );
			}

			// return to the base map, one pixel further along, and move to the next source luxel
			m_LightmapPixelWriter.SkipBytes( nBytesBackToNextPixel );
			nSrc += nFloatsPerLuxel;
		}
	}
}

#ifdef _X360
#pragma optimize("u", on)
#endif


#ifdef _X360

namespace {
	// pack a pixel into BGRA8888 and return it with the data packed into the w component
FORCEINLINE fltx4 PackPixel_BGRA8888( FLTX4 rgba ) 
{
	// Upper bound applied to every channel (the input might have leaked over
	// 255). Only the top end is clamped here; there is no lower clamp.
	static const fltx4 vChannelMax = {255.0f, 255.0f, 255.0f, 255.0f};

	// Multiply-accumulating a channel value r against this scale and the bias
	// below yields 3.0 + (r)*2^-22, which puts the bits of r at the bottom of
	// the float.
	static const XMVECTOR vMantissaScale = { (1.0f / (FLOAT)(1 << 22)), (1.0f / (FLOAT)(1 << 22)), (1.0f / (FLOAT)(1 << 22)), (1.0f / (FLOAT)(1 << 22))};
	static const XMVECTOR vMantissaBias = {3.0f, 3.0f, 3.0f, 3.0f};

	const fltx4 vClamped = MinSIMD( vChannelMax, rgba );
	const fltx4 vBiased = __vmaddfp( vClamped, vMantissaScale, vMantissaBias );

	// The channels happen to be in an order that suits the builtin packing op;
	// the packed pixel is returned in the w word.
	return __vpkd3d( vBiased, vBiased, VPACK_D3DCOLOR, VPACK_32, 0 );
}

// A small store-gather buffer used by
// BumpedLightmapBitsToPixelWriter_HDRI_BGRA_X360().
//
// Packed 32-bit pixels for four output rows are collected in m_data and then
// copied out to the pixel writer with memcpy, either when a row buffer fills
// up or when commit() is called. Hopefully these buffers will live in the L1
// cache, which will make writing to them, then to memory, faster
// than just using __stvewx to write directly into WC memory
// one noncontiguous float at a time. (If there weren't a huge
// compiler bug with __stvewx in the Apr07 XDK, that might not
// be the case.)
//
// Call order matters: write() stores whole SIMD vectors and needs
// m_wordsGathered to be a multiple of four, so it must not follow a
// writeJustW() without a commit() in between.
struct ALIGN128 CPixelWriterStoreGather
{
	enum {
		kRows = 4,			// number of row buffers; one per FLTX4 argument of write()/writeJustW()
		kWordsPerRow = 32,	// 32-bit words buffered per row before a flush is forced
	};

	ALIGN128 uint32 m_data[kRows][kWordsPerRow]; // four rows of bgra data, aligned to 4 cache lines. dwords so memcpy works better.
	int m_wordsGathered;			// number of words currently buffered in each row
	int m_bytesBetweenWriterRows; // the number of bytes spacing the maps inside the writer from each other
								// if we weren't gathering, we'd SkipBytes this many between the base map, bump1, etc.

	// Write four rows, as SIMD registers, into the buffers: all four words of
	// rowN are appended to m_data[N]. Flushes to pLightmapPixelWriter first if
	// the buffers are already full.
	inline void write( CPixelWriter * RESTRICT pLightmapPixelWriter, FLTX4 row0,  FLTX4 row1,  FLTX4 row2,  FLTX4 row3 ) RESTRICT
	{
		// if full, commit
		Assert(m_wordsGathered <= kWordsPerRow);
		// single-word writes leave the fill position off a four-word boundary,
		// which the whole-vector stores below cannot tolerate
		AssertMsg((m_wordsGathered & 3) == 0, "Don't call CPixelWriterStoreGather::write after ::writeJustW");
		if (m_wordsGathered >= kWordsPerRow)
		{
			commitWhenFull(pLightmapPixelWriter);
		}

		XMStoreVector4A( &m_data[0][m_wordsGathered], row0 );
		XMStoreVector4A( &m_data[1][m_wordsGathered], row1 );
		XMStoreVector4A( &m_data[2][m_wordsGathered], row2 );
		XMStoreVector4A( &m_data[3][m_wordsGathered], row3 );

		m_wordsGathered += 4 ; // four words per simd vec
	}

	// Pluck the w component out of each of the rows, and store it into the
	// gather buffer (one word appended per row). Flushes first if the buffers
	// are already full. Don't call the other write function after calling this.
	inline void writeJustW( CPixelWriter * RESTRICT pLightmapPixelWriter, FLTX4 row0,  FLTX4 row1,  FLTX4 row2,  FLTX4 row3 ) RESTRICT
	{
		// if full, commit
		Assert(m_wordsGathered <= kWordsPerRow);
		if (m_wordsGathered >= kWordsPerRow)
		{
			commitWhenFull(pLightmapPixelWriter);
		}

		// for each fltx4, splat out w (word 3) and then use the __stvewx to
		// store whichever word happens to align with the float pointer through
		// that pointer. Since every word of the splatted vector is the same,
		// it does not matter which one that is.

		__stvewx(__vspltw(row0, 3), &m_data[0][m_wordsGathered], 0 );
		__stvewx(__vspltw(row1, 3), &m_data[1][m_wordsGathered], 0 );
		__stvewx(__vspltw(row2, 3), &m_data[2][m_wordsGathered], 0 );
		__stvewx(__vspltw(row3, 3), &m_data[3][m_wordsGathered], 0 );

		m_wordsGathered += 1 ; // only stored one word
	}

	// Commit my buffers to the pixelwriter's memory, and advance its
	// pointer. Row N is copied to the writer's current position plus
	// N * m_bytesBetweenWriterRows; the writer is then advanced by the number
	// of bytes copied per row. Does nothing when the buffers are empty.
	void commit(CPixelWriter * RESTRICT pLightmapPixelWriter) RESTRICT
	{
		if (m_wordsGathered > 0)
		{
			unsigned char* RESTRICT pWriteInto = pLightmapPixelWriter->GetCurrentPixel();
			// we have to use memcpy because we're writing to non-cacheable memory,
			// but we can't even assume that the addresses we're writing to are
			// vector-aligned.
#ifdef memcpy // if someone's overriden the intrinsic, complain (non-fatally) in the build log
#pragma message("You have overridden memcpy(), which is an XBOX360 intrinsic. This function will not behave optimally.")
#endif

			memcpy(pWriteInto, m_data[0], m_wordsGathered * sizeof(uint32));
			pWriteInto += m_bytesBetweenWriterRows;
			memcpy(pWriteInto, m_data[1], m_wordsGathered * sizeof(uint32));
			pWriteInto += m_bytesBetweenWriterRows;
			memcpy(pWriteInto, m_data[2], m_wordsGathered * sizeof(uint32));
			pWriteInto += m_bytesBetweenWriterRows;
			memcpy(pWriteInto, m_data[3], m_wordsGathered * sizeof(uint32));

			pLightmapPixelWriter->SkipBytes(m_wordsGathered * sizeof(uint32));
			m_wordsGathered = 0;
		}
	}

	// like commit, but the version we use when we know we're full.
	// Takes advantage of better compile-time generation for 
	// memcpy. The caller must guarantee m_wordsGathered == kWordsPerRow.
	void commitWhenFull(CPixelWriter * RESTRICT pLightmapPixelWriter) RESTRICT
	{
		unsigned char* RESTRICT pWriteInto = pLightmapPixelWriter->GetCurrentPixel();
		// we have to use memcpy because we're writing to non-cacheable memory,
		// but we can't even assume that the addresses we're writing to are
		// vector-aligned.
#ifdef memcpy // if someone's overriden the intrinsic, complain (non-fatally) in the build log
#pragma message("You have overridden memcpy(), which is an XBOX360 intrinsic. This function will not behave optimally.")
#endif

		// if we're full, use compile-time known version of 
		// memcpy to take advantage of its ability to generate
		// inline code. In fact, use the dword-aligned
		// version so that we use the 64-bit writing funcs.
		Assert( m_wordsGathered == kWordsPerRow );
		COMPILE_TIME_ASSERT((kWordsPerRow & 3) == 0); // the number of words per row has to be a multiple of four
		
		memcpy(pWriteInto, reinterpret_cast<uint64* RESTRICT>(m_data[0]), kWordsPerRow * sizeof(uint32));
		pWriteInto += m_bytesBetweenWriterRows;
		memcpy(pWriteInto, reinterpret_cast<uint64* RESTRICT>(m_data[1]), kWordsPerRow * sizeof(uint32));
		pWriteInto += m_bytesBetweenWriterRows;
		memcpy(pWriteInto, reinterpret_cast<uint64* RESTRICT>(m_data[2]), kWordsPerRow * sizeof(uint32));
		pWriteInto += m_bytesBetweenWriterRows;
		memcpy(pWriteInto, reinterpret_cast<uint64* RESTRICT>(m_data[3]), kWordsPerRow * sizeof(uint32));
		
		pLightmapPixelWriter->SkipBytes(m_wordsGathered * sizeof(uint32));
		m_wordsGathered = 0;
	}

	// parameter: space between bump pages in the pixelwriter
	CPixelWriterStoreGather(int writerSizeBytes) : m_wordsGathered(0), m_bytesBetweenWriterRows(writerSizeBytes) {}

};
}


// this is a function for specifically writing bumped BGRA lightmaps -- in order for it
// to be properly scheduled, I needed to break out the inline functions. Also,
// to make the write-combined memory more efficient (and work around a bug in the
// April 2007 XDK), we need to store-gather our writes on the cache before blasting
// them out to write-combined memory. We can't simply write from the SIMD registers
// into the pixelwriter's data, because the difference between the output rows,
// eg nLightmap0WriterSizeBytes[0], might not be a multiple of 16. Unaligned stores
// to non-cacheable memory cause an alignment exception.
//
// Parameters:
//   pFloatImage             - base luxels, one Vector4D (four floats) per luxel
//   pFloatImageBump1..3     - luxels of the three bump images, same layout
//   pLightmapSize           - [0] = luxels per row, [1] = number of rows
//   pOffsetIntoLightmapPage - x/y handed to the writer's Seek() for row 0
//   pfmOut                  - must be NULL (asserted); PFM output is not supported here
//   m_LightmapPixelWriter   - destination writer; its pixel size is asserted to be 4 bytes
//
// Each source row is processed in two passes: whole groups of four luxels go
// through CPixelWriterStoreGather::write(), then the 0-3 leftover luxels go
// through writeJustW(). The buffer is committed between the passes and again
// at the end of the row.
static void BumpedLightmapBitsToPixelWriter_HDRI_BGRA_X360( float* RESTRICT pFloatImage, float * RESTRICT pFloatImageBump1, float * RESTRICT pFloatImageBump2, 
													  float * RESTRICT pFloatImageBump3, int pLightmapSize[2], int pOffsetIntoLightmapPage[2], FloatBitMap_t *pfmOut,
													  CPixelWriter * RESTRICT m_LightmapPixelWriter)
{
	AssertMsg(m_LightmapPixelWriter->GetPixelSize() == 4, "BGRA format is no longer four bytes long? This is unsupported on 360, and probably immoral as well.");
	// Bytes taken by one lightmap row of 4-byte pixels. Handed to the
	// store-gather buffer below as the spacing between the four output maps.
	const int nLightmap0WriterSizeBytes = pLightmapSize[0] * 4 /*m_LightmapPixelWriter->GetPixelSize()*/;
	// const int nRewindToNextPixel = -( ( nLightmap0WriterSizeBytes * 3 ) - 4 );

	// A Vector4D must be exactly four floats: the grouped loads below step
	// through the source images with hard-coded offsets of 4, 8 and 12 floats.
	COMPILE_TIME_ASSERT(sizeof( Vector4D ) == sizeof(float) * 4); 

	AssertMsg(!pfmOut, "Runtime conversion of lightmaps to files is no longer supported on 360.\n");

	
	// The store-gather buffers. Hopefully these will live in the L1
	// cache, which will make writing to them, then to memory, faster
	// than just using __stvewx to write directly into WC memory
	// one noncontiguous float at a time. (If there weren't a huge
	// compiler bug with __stvewx in the Apr07 XDK, that might not
	// be the case.)
	CPixelWriterStoreGather storeGather(nLightmap0WriterSizeBytes);

	// one iteration per row of luxels
	for( int t = 0; t < pLightmapSize[1]; t++ )
	{
		// NOTE(review): FOUR is not #undef'd anywhere in this function, so the
		// macro stays defined for whatever follows it in the file.
#define	FOUR (sizeof( Vector4D ) / sizeof( float ))  //  make explicit when we're incrementing by length of a 4dvec
		// index of the first float of this row's first luxel
		int srcTexelOffset = ( FOUR ) * ( 0 + t * pLightmapSize[0] );
		m_LightmapPixelWriter->Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );

		// Our code works best when we can process luxels in groups of four. So,
		// figure out how many four-luxel groups we can process,
		// then do them in groups, then process the remainder.
		// (& ~3 rounds the row width down to a multiple of four.)
		unsigned int groupsOfFourLimit = (((unsigned int)pLightmapSize[0]) & ~3);
		
		// we want to hang on to this index when we're done with groups so we can do the remainder.
		unsigned int s; // counts the number of luxels processed
		for( s = 0; 
			s < groupsOfFourLimit; 
			s += 4, srcTexelOffset += 4 * ( FOUR ))
		{				
			// upper clamp applied to every channel before conversion to RGB-scale
			static const fltx4 vSixteen = {16.0f, 16.0f, 16.0f, 16.0f};
			// the store-gather simds: each ends up holding four packed pixels
			// (luxels s..s+3) for one of the four output maps
			fltx4 outBaseMap = Four_Zeros, outBump1 = Four_Zeros, outBump2 = Four_Zeros, outBump3 = Four_Zeros;
			// we'll read four at a time; index N is luxel s+N
			fltx4 vFloatImage[4], vFloatImageBump1[4], vFloatImageBump2[4], vFloatImageBump3[4];


			// stripe these loads to cause less ERAT thrashing
			vFloatImage[0]	  = LoadUnalignedSIMD(pFloatImage	   + srcTexelOffset );
			vFloatImage[1]	  = LoadUnalignedSIMD(pFloatImage	   + srcTexelOffset + 4 );
			vFloatImage[2]	  = LoadUnalignedSIMD(pFloatImage	   + srcTexelOffset + 8 );
			vFloatImage[3]	  = LoadUnalignedSIMD(pFloatImage	   + srcTexelOffset + 12 );

			vFloatImageBump1[0] = LoadUnalignedSIMD(pFloatImageBump1 + srcTexelOffset );
			vFloatImageBump1[1] = LoadUnalignedSIMD(pFloatImageBump1 + srcTexelOffset + 4 );
			vFloatImageBump1[2] = LoadUnalignedSIMD(pFloatImageBump1 + srcTexelOffset + 8 );
			vFloatImageBump1[3] = LoadUnalignedSIMD(pFloatImageBump1 + srcTexelOffset + 12 );

			vFloatImageBump2[0] = LoadUnalignedSIMD(pFloatImageBump2 + srcTexelOffset );
			vFloatImageBump2[1] = LoadUnalignedSIMD(pFloatImageBump2 + srcTexelOffset + 4 );
			vFloatImageBump2[2] = LoadUnalignedSIMD(pFloatImageBump2 + srcTexelOffset + 8 );
			vFloatImageBump2[3] = LoadUnalignedSIMD(pFloatImageBump2 + srcTexelOffset + 12 );

			vFloatImageBump3[0] = LoadUnalignedSIMD(pFloatImageBump3 + srcTexelOffset );
			vFloatImageBump3[1] = LoadUnalignedSIMD(pFloatImageBump3 + srcTexelOffset + 4 );
			vFloatImageBump3[2] = LoadUnalignedSIMD(pFloatImageBump3 + srcTexelOffset + 8 );
			vFloatImageBump3[3] = LoadUnalignedSIMD(pFloatImageBump3 + srcTexelOffset + 12 );

			// perform an arcane averaging operation upon the bump map values
			// (todo: make this not an inline so it will schedule better -- inlining is 
			//  done by the linker, which is too late for operation scheduling)
			ColorSpace::LinearToBumpedLightmap( vFloatImage[0],	vFloatImageBump1[0],
												vFloatImageBump2[0], vFloatImageBump3[0],
												// transform "in place":
												vFloatImage[0], vFloatImageBump1[0], 
												vFloatImageBump2[0], vFloatImageBump3[0] );
			ColorSpace::LinearToBumpedLightmap( vFloatImage[1],	vFloatImageBump1[1],
												vFloatImageBump2[1], vFloatImageBump3[1],
												// transform "in place":
												vFloatImage[1], vFloatImageBump1[1], 
												vFloatImageBump2[1], vFloatImageBump3[1] );
			ColorSpace::LinearToBumpedLightmap( vFloatImage[2],	vFloatImageBump1[2],
												vFloatImageBump2[2], vFloatImageBump3[2],
												// transform "in place":
												vFloatImage[2], vFloatImageBump1[2], 
												vFloatImageBump2[2], vFloatImageBump3[2] );
			ColorSpace::LinearToBumpedLightmap( vFloatImage[3],	vFloatImageBump1[3],
												vFloatImageBump2[3], vFloatImageBump3[3],
												// transform "in place":
												vFloatImage[3], vFloatImageBump1[3], 
												vFloatImageBump2[3], vFloatImageBump3[3] );
	

			// convert each color to RGB scaled.
			// DO NOT! make this into a for loop. The (April07 XDK) compiler
			// in fact DOES NOT unroll them, and will perform very naive
			// scheduling if you try. 

			// clamp to 0..16 float (only the upper bound is applied here, via MinSIMD)
			vFloatImage[0]		= MinSIMD(vFloatImage[0], vSixteen);
			vFloatImageBump1[0] = MinSIMD(vFloatImageBump1[0], vSixteen);
			vFloatImageBump2[0] = MinSIMD(vFloatImageBump2[0], vSixteen);
			vFloatImageBump3[0] = MinSIMD(vFloatImageBump3[0], vSixteen);

			vFloatImage[1]		= MinSIMD(vFloatImage[1], vSixteen);
			vFloatImageBump1[1] = MinSIMD(vFloatImageBump1[1], vSixteen);
			vFloatImageBump2[1] = MinSIMD(vFloatImageBump2[1], vSixteen);
			vFloatImageBump3[1] = MinSIMD(vFloatImageBump3[1], vSixteen);

			vFloatImage[2]		= MinSIMD(vFloatImage[2], vSixteen);
			vFloatImageBump1[2] = MinSIMD(vFloatImageBump1[2], vSixteen);
			vFloatImageBump2[2] = MinSIMD(vFloatImageBump2[2], vSixteen);
			vFloatImageBump3[2] = MinSIMD(vFloatImageBump3[2], vSixteen);

			vFloatImage[3]		= MinSIMD(vFloatImage[3], vSixteen);
			vFloatImageBump1[3] = MinSIMD(vFloatImageBump1[3], vSixteen);
			vFloatImageBump2[3] = MinSIMD(vFloatImageBump2[3], vSixteen);
			vFloatImageBump3[3] = MinSIMD(vFloatImageBump3[3], vSixteen);


			// compute the scaling factor, place it in w, and 
			// scale the rest by it. Obliterates whatever was
			// already in alpha.
			// This code is why it is important to not use a for
			// loop: you need to let the compiler keep the value
			// on registers (which it can't do if you use a
			// variable indexed array) and interleave the
			// inlined instructions.
			// The result is packed straight away, so afterwards each register
			// carries its finished 32-bit pixel in the w word.

			vFloatImage[0]		= PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImage[0]) );
			vFloatImageBump1[0] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump1[0]) );
			vFloatImageBump2[0] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump2[0]) );
			vFloatImageBump3[0] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump3[0]) );

			vFloatImage[1]		= PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImage[1]) );
			vFloatImageBump1[1] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump1[1]) );
			vFloatImageBump2[1] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump2[1]) );
			vFloatImageBump3[1] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump3[1]) );

			vFloatImage[2]		= PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImage[2]) );
			vFloatImageBump1[2] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump1[2]) );
			vFloatImageBump2[2] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump2[2]) );
			vFloatImageBump3[2] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump3[2]) );

			vFloatImage[3]		= PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImage[3]) );
			vFloatImageBump1[3] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump1[3]) );
			vFloatImageBump2[3] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump2[3]) );
			vFloatImageBump3[3] = PackPixel_BGRA8888( ConvertLightmapColorToRGBScale(vFloatImageBump3[3]) );

			// Each of the registers above contains one RGBA 32-bit struct
			// in their w word. So, combine them such that each of the assignees
			// below contains four RGBAs, in xyzw order (big-endian).
			// Luxel s+0 lands in x, s+1 in y, s+2 in z and s+3 in w.

			outBaseMap = __vrlimi(outBaseMap, vFloatImage[0], 8, 3 ); // insert into x
			outBump1 =	 __vrlimi(outBump1, vFloatImageBump1[0], 8, 3 ); // insert into x
			outBump2 =	 __vrlimi(outBump2, vFloatImageBump2[0], 8, 3 ); // insert into x
			outBump3 =	 __vrlimi(outBump3, vFloatImageBump3[0], 8, 3 ); // insert into x

			outBaseMap = __vrlimi(outBaseMap, vFloatImage[1], 4, 2 ); // insert into y
			outBump1 =	 __vrlimi(outBump1, vFloatImageBump1[1], 4, 2 ); // insert into y
			outBump2 =	 __vrlimi(outBump2, vFloatImageBump2[1], 4, 2 ); // insert into y
			outBump3 =	 __vrlimi(outBump3, vFloatImageBump3[1], 4, 2 ); // insert into y

			outBaseMap = __vrlimi(outBaseMap, vFloatImage[2], 2, 1 ); // insert into z
			outBump1 =	 __vrlimi(outBump1, vFloatImageBump1[2], 2, 1 ); // insert into z
			outBump2 =	 __vrlimi(outBump2, vFloatImageBump2[2], 2, 1 ); // insert into z
			outBump3 =	 __vrlimi(outBump3, vFloatImageBump3[2], 2, 1 ); // insert into z

			outBaseMap = __vrlimi(outBaseMap, vFloatImage[3], 1, 0 ); // insert into w
			outBump1 =	 __vrlimi(outBump1, vFloatImageBump1[3], 1, 0 ); // insert into w
			outBump2 =	 __vrlimi(outBump2, vFloatImageBump2[3], 1, 0 ); // insert into w
			outBump3 =	 __vrlimi(outBump3, vFloatImageBump3[3], 1, 0 ); // insert into w

			// push the data through the store-gather buffer.
			storeGather.write(m_LightmapPixelWriter, outBaseMap, outBump1, outBump2, outBump3);

		}

		// Once here, make sure we've committed any leftover changes, then process
		// the remainders singly. (This also empties the buffer, so the
		// single-word writes below start from a clean state.)
		storeGather.commit(m_LightmapPixelWriter);

		for( ;  // s is where it should be from the loop above
			s < (unsigned int) pLightmapSize[0]; 
			s++, 
				// m_LightmapPixelWriter->SkipBytes(nRewindToNextPixel), // now handled by store-gather
				srcTexelOffset += ( FOUR ))
		{				

			// upper clamp applied to every channel before conversion to RGB-scale
			static const fltx4 vSixteen = {16.0f, 16.0f, 16.0f, 16.0f};
			// vColor[0] is the base map result, vColor[1..3] the three bump map results
			fltx4 vColor[4];
			fltx4 vFloatImage = LoadUnalignedSIMD(&pFloatImage[srcTexelOffset]);
			fltx4 vFloatImageBump1 = LoadUnalignedSIMD(&pFloatImageBump1[srcTexelOffset]);
			fltx4 vFloatImageBump2 = LoadUnalignedSIMD(&pFloatImageBump2[srcTexelOffset]);
			fltx4 vFloatImageBump3 = LoadUnalignedSIMD(&pFloatImageBump3[srcTexelOffset]);

			// perform an arcane averaging operation upon the bump map values
			ColorSpace::LinearToBumpedLightmap( vFloatImage,
				vFloatImageBump1, vFloatImageBump2,
				vFloatImageBump3,
				vColor[0], vColor[1], vColor[2], vColor[3] );		

			// convert each color to RGB scaled.
			// DO NOT! make this into a for loop. The (April07 XDK) compiler
			// in fact DOES NOT unroll them, and will perform very naive
			// scheduling if you try. 

			// clamp to 0..16 float (only the upper bound is applied here, via MinSIMD)
			vColor[0] = MinSIMD(vColor[0], vSixteen);
			vColor[1] = MinSIMD(vColor[1], vSixteen);
			vColor[2] = MinSIMD(vColor[2], vSixteen);
			vColor[3] = MinSIMD(vColor[3], vSixteen);

			// compute the scaling factor, place it in w, and 
			// scale the rest by it. Obliterates whatever was
			// already in alpha.
			// This code is why it is important to not use a for
			// loop: you need to let the compiler interleave the
			// inlined instructions.
			vColor[0] = ConvertLightmapColorToRGBScale( vColor[0] );
			vColor[1] = ConvertLightmapColorToRGBScale( vColor[1] );
			vColor[2] = ConvertLightmapColorToRGBScale( vColor[2] );
			vColor[3] = ConvertLightmapColorToRGBScale( vColor[3] );


			// Optional debug aid: recompute the same luxel through the scalar
			// path into `color`. The comparison against the SIMD result is
			// commented out below, so these values are currently computed but
			// not checked.
#ifdef X360_DOUBLECHECK_LIGHTMAPS
			unsigned short color[4][4];

			ColorSpace::LinearToBumpedLightmap( &pFloatImage[srcTexelOffset],
				&pFloatImageBump1[srcTexelOffset], &pFloatImageBump2[srcTexelOffset],
				&pFloatImageBump3[srcTexelOffset],
				color[0], color[1], color[2], color[3] );
			unsigned short alpha = ColorSpace::LinearToUnsignedShort( pFloatImage[srcTexelOffset+3], 16 );
			color[0][3] = color[1][3] = color[2][3] = color[3][3] = alpha;

			if( IsX360() )
			{
				for( int i = 0; i != 4; ++i )
				{
					Vector4D vRGBScale;

					vRGBScale.x = color[i][0] * (16.0f / 65535.0f);
					vRGBScale.y = color[i][1] * (16.0f / 65535.0f);
					vRGBScale.z = color[i][2] * (16.0f / 65535.0f);
					vRGBScale = ConvertLightmapColorToRGBScale( &vRGBScale.x );
					color[i][0] = RoundFloatToByte( vRGBScale.x * 255.0f );
					color[i][1] = RoundFloatToByte( vRGBScale.y * 255.0f );
					color[i][2] = RoundFloatToByte( vRGBScale.z * 255.0f );
					color[i][3] = RoundFloatToByte( vRGBScale.w * 255.0f );
				}						
			}

			/*
			for (int ii = 0; ii < 4; ++ii)
			{
				uint32 pack = (PackPixel_BGRA8888( vColor[ii] ).u[3]);
				if (color[ii][3] != 0)
				Assert(	color[ii][0] == (pack & 0xFF0000) >> 16	&& 
						color[ii][1] == (pack & 0xFF00) >> 8		&& 
						color[ii][2] == (pack & 0xFF)				&& 
						color[ii][3] == (pack & 0xFF000000) >> 24 );
			}
			*/

#endif


				// pack each result; the finished pixel sits in the w word
				vColor[0] = PackPixel_BGRA8888( vColor[0] );
				vColor[1] = PackPixel_BGRA8888( vColor[1] );
				vColor[2] = PackPixel_BGRA8888( vColor[2] );
				vColor[3] = PackPixel_BGRA8888( vColor[3] );

				// append one pixel (the w word) to each of the four row buffers
				storeGather.writeJustW(m_LightmapPixelWriter, vColor[0], vColor[1], vColor[2], vColor[3] );

				/* // here is the old way of writing pixels:
				// now we store-gather this
				m_LightmapPixelWriter->WritePixelNoAdvance_BGRA8888( vColor[0] );
				Assert(*reinterpret_cast<unsigned int *>(m_LightmapPixelWriter->GetCurrentPixel()) == PackPixel_BGRA8888( vColor[0] ).u[3] );
				void * RESTRICT pBits = m_LightmapPixelWriter->SkipBytes( nLightmap0WriterSizeBytes );
				m_LightmapPixelWriter->WritePixelNoAdvance_BGRA8888( vColor[1], pBits );
				Assert(*reinterpret_cast<unsigned int *>(m_LightmapPixelWriter->GetCurrentPixel()) == PackPixel_BGRA8888( vColor[1] ).u[3] );
				pBits = m_LightmapPixelWriter->SkipBytes( nLightmap0WriterSizeBytes );
				m_LightmapPixelWriter->WritePixelNoAdvance_BGRA8888( vColor[2], pBits );
				Assert(*reinterpret_cast<unsigned int *>(m_LightmapPixelWriter->GetCurrentPixel()) == PackPixel_BGRA8888( vColor[2] ).u[3] );
				pBits = m_LightmapPixelWriter->SkipBytes( nLightmap0WriterSizeBytes );
				m_LightmapPixelWriter->WritePixelNoAdvance_BGRA8888( vColor[3], pBits );
				Assert(*reinterpret_cast<unsigned int *>(m_LightmapPixelWriter->GetCurrentPixel()) == PackPixel_BGRA8888( vColor[3] ).u[3] );

				m_LightmapPixelWriter->SkipBytes(nRewindToNextPixel);
				*/
		}

		// flush the leftover single-word writes before moving on to the next row
		storeGather.commit(m_LightmapPixelWriter);

	}
}

#endif //_X360

// write bumped lightmap update to HDR integer lightmap
void CMatLightmaps::BumpedLightmapBitsToPixelWriter_HDRI( float* RESTRICT pFloatImage, float * RESTRICT pFloatImageBump1, float * RESTRICT pFloatImageBump2, 
												 float * RESTRICT pFloatImageBump3, int pLightmapSize[2], int pOffsetIntoLightmapPage[2], FloatBitMap_t *pfmOut ) RESTRICT
{
	const int nLightmapSize0 = pLightmapSize[0];
	const int nLightmap0WriterSizeBytes = nLightmapSize0 * m_LightmapPixelWriter.GetPixelSize();
	const int nRewindToNextPixel = -( ( nLightmap0WriterSizeBytes * 3 ) - m_LightmapPixelWriter.GetPixelSize() );

	if( m_LightmapPixelWriter.IsUsingFloatFormat() )
	{
		AssertMsg(!IsX360(), "Tried to use a floating-point pixel format for lightmaps on 360, which is not supported.");
		if (!IsX360())
		{
			for( int t = 0; t < pLightmapSize[1]; t++ )
			{
				int srcTexelOffset = ( sizeof( Vector4D ) / sizeof( float ) ) * ( 0 + t * nLightmapSize0 );
				m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );

				for( int s = 0; 
					s < nLightmapSize0; 
					s++, m_LightmapPixelWriter.SkipBytes(nRewindToNextPixel),srcTexelOffset += (sizeof(Vector4D)/sizeof(float)))
				{
					unsigned short color[4][4];

					ColorSpace::LinearToBumpedLightmap( &pFloatImage[srcTexelOffset],
						&pFloatImageBump1[srcTexelOffset], &pFloatImageBump2[srcTexelOffset],
						&pFloatImageBump3[srcTexelOffset],
						color[0], color[1], color[2], color[3] );
					float alpha = pFloatImage[srcTexelOffset+3];
					Assert( alpha >= 0.0f && alpha <= 1.0f );

					if ( HardwareConfig()->GetCSMAccurateBlending() )
					{
						float alphaF[4];

						ColorSpace::LinearToBumpedLightmapAlpha( &pFloatImage[srcTexelOffset + 3],
																 &pFloatImageBump1[srcTexelOffset + 3], &pFloatImageBump2[srcTexelOffset + 3], &pFloatImageBump3[srcTexelOffset + 3],
																 &alphaF[0], &alphaF[1], &alphaF[2], &alphaF[3] );

						unsigned short alphaUS[4];
						alphaUS[0] = ColorSpace::LinearToUnsignedShort( alphaF[0], 16 );
						alphaUS[1] = ColorSpace::LinearToUnsignedShort( alphaF[1], 16 );
						alphaUS[2] = ColorSpace::LinearToUnsignedShort( alphaF[2], 16 );
						alphaUS[3] = ColorSpace::LinearToUnsignedShort( alphaF[3], 16 );

						color[0][3] = alphaUS[0];
						color[1][3] = alphaUS[1];
						color[2][3] = alphaUS[2];
						color[3][3] = alphaUS[3];
					}
					else
					{
						color[0][3] = color[1][3] = color[2][3] = color[3][3] = alpha;
					}

					float toFloat = ( 1.0f / ( float )( 1 << 16 ) );

					/* // This code is now a can't-happen, because we do not allow float formats on 360.
#if ( defined( USE_32BIT_LIGHTMAPS_ON_360 ) )
					if( IsX360() )
					{
						for( int i = 0; i != 4; ++i )
						{
							Vector4D vRGBScale;

							vRGBScale.x = color[i][0] * (16.0f / 65535.0f);
							vRGBScale.y = color[i][1] * (16.0f / 65535.0f);
							vRGBScale.z = color[i][2] * (16.0f / 65535.0f);
							vRGBScale = ConvertLightmapColorToRGBScale( &vRGBScale.x );
							color[i][0] = RoundFloatToByte( vRGBScale.x * 255.0f );
							color[i][1] = RoundFloatToByte( vRGBScale.y * 255.0f );
							color[i][2] = RoundFloatToByte( vRGBScale.z * 255.0f );
							color[i][3] = RoundFloatToByte( vRGBScale.w * 255.0f );
						}

						toFloat = ( 1.0f / ( float )( 1 << 8 ) );
					}
#endif
					*/

					m_LightmapPixelWriter.WritePixelNoAdvanceF( toFloat * color[0][0], toFloat * color[0][1], toFloat * color[0][2], toFloat * color[0][3] );

					m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
					m_LightmapPixelWriter.WritePixelNoAdvanceF( toFloat * color[1][0], toFloat * color[1][1], toFloat * color[1][2], toFloat * color[1][3] );

					m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
					m_LightmapPixelWriter.WritePixelNoAdvanceF( toFloat * color[2][0], toFloat * color[2][1], toFloat * color[2][2], toFloat * color[2][3] );

					m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
					m_LightmapPixelWriter.WritePixelNoAdvanceF( toFloat * color[3][0], toFloat * color[3][1], toFloat * color[3][2], toFloat * color[3][3] );
				}
			}
		}
	}
	else
	{
#ifndef X360_USE_SIMD_LIGHTMAP
		for( int t = 0; t < pLightmapSize[1]; t++ )
		{
			int srcTexelOffset = ( sizeof( Vector4D ) / sizeof( float ) ) * ( 0 + t * nLightmapSize0 );
			m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );

			for( int s = 0; 
				s < nLightmapSize0; 
				s++, m_LightmapPixelWriter.SkipBytes(nRewindToNextPixel),srcTexelOffset += (sizeof(Vector4D)/sizeof(float)))
			{					
				unsigned short color[4][4];

				ColorSpace::LinearToBumpedLightmap( &pFloatImage[srcTexelOffset],
					&pFloatImageBump1[srcTexelOffset], &pFloatImageBump2[srcTexelOffset],
					&pFloatImageBump3[srcTexelOffset],
					color[0], color[1], color[2], color[3] );

				if ( HardwareConfig()->GetCSMAccurateBlending() )
				{
					float alpha[4];
					ColorSpace::LinearToBumpedLightmapAlpha( &pFloatImage[srcTexelOffset + 3],
															 &pFloatImageBump1[srcTexelOffset + 3], &pFloatImageBump2[srcTexelOffset + 3], &pFloatImageBump3[srcTexelOffset + 3],
															 &alpha[0], &alpha[1], &alpha[2], &alpha[3] );

					unsigned short alphaUS[4];
					alphaUS[0] = ColorSpace::LinearToUnsignedShort( alpha[0], 16 );
					alphaUS[1] = ColorSpace::LinearToUnsignedShort( alpha[1], 16 );
					alphaUS[2] = ColorSpace::LinearToUnsignedShort( alpha[2], 16 );
					alphaUS[3] = ColorSpace::LinearToUnsignedShort( alpha[3], 16 );

					color[0][3] = alphaUS[0];
					color[1][3] = alphaUS[1];
					color[2][3] = alphaUS[2];
					color[3][3] = alphaUS[3];
				}
				else
				{
					unsigned short alpha = ColorSpace::LinearToUnsignedShort( pFloatImage[srcTexelOffset+3], 16 );
					color[0][3] = color[1][3] = color[2][3] = color[3][3] = alpha;
				}

#if ( defined( USE_32BIT_LIGHTMAPS_ON_360 ) )
				if( IsX360() )
				{
					for( int i = 0; i != 4; ++i )
					{
						Vector4D vRGBScale;

						vRGBScale.x = color[i][0] * (16.0f / 65535.0f);
						vRGBScale.y = color[i][1] * (16.0f / 65535.0f);
						vRGBScale.z = color[i][2] * (16.0f / 65535.0f);
						vRGBScale = ConvertLightmapColorToRGBScale( &vRGBScale.x );
						color[i][0] = RoundFloatToByte( vRGBScale.x * 255.0f );
						color[i][1] = RoundFloatToByte( vRGBScale.y * 255.0f );
						color[i][2] = RoundFloatToByte( vRGBScale.z * 255.0f );
						color[i][3] = RoundFloatToByte( vRGBScale.w * 255.0f );
					}						
				}
#endif
				m_LightmapPixelWriter.WritePixelNoAdvance( color[0][0], color[0][1], color[0][2], color[0][3] );

				m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
				m_LightmapPixelWriter.WritePixelNoAdvance( color[1][0], color[1][1], color[1][2], color[1][3] );

				m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
				m_LightmapPixelWriter.WritePixelNoAdvance( color[2][0], color[2][1], color[2][2], color[2][3] );

				m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
				m_LightmapPixelWriter.WritePixelNoAdvance( color[3][0], color[3][1], color[3][2], color[3][3] );

				// Write data to the bitmapped represenations so that PFM files can be written
				if ( pfmOut )
				{
					PixRGBAF pixelData;
					pixelData.Red = color[0][0];                  
					pixelData.Green = color[0][1];                  
					pixelData.Blue = color[0][2];
					pixelData.Alpha = color[0][3];
					pfmOut->WritePixelRGBAF(pOffsetIntoLightmapPage[0] + s, pOffsetIntoLightmapPage[1] + t, 0, pixelData);
				}
			}
		}
#else
		// this is an optimized XBOX implementation. For a clearer
		// presentation of the algorithm, see the PC implementation
		// above.
		// First check for the most common case, using an efficient
		// branch rather than a switch:
		if (m_LightmapPixelWriter.GetFormat() == IMAGE_FORMAT_LINEAR_BGRA8888)
		{
			// broken out into a static to make things more readable
			// and be nicer to the instruction cache
			BumpedLightmapBitsToPixelWriter_HDRI_BGRA_X360( pFloatImage, pFloatImageBump1, pFloatImageBump2, 
				pFloatImageBump3, pLightmapSize, pOffsetIntoLightmapPage, pfmOut, &m_LightmapPixelWriter );
		}
		else
		{
			// This case is used in Portal 2 to fill RGBA16161616 lightmaps
			Assert( m_LightmapPixelWriter.GetPixelSize() == 8 );

			for( int t = 0; t < pLightmapSize[1]; t++ )
			{
				// assert that 1 * 4 = 4 
				COMPILE_TIME_ASSERT(sizeof( Vector4D ) == sizeof(float) * 4); 
#define	FOUR (sizeof( Vector4D ) / sizeof( float ))  // in case this ever changes
				int srcTexelOffset = ( FOUR ) * ( 0 + t * nLightmapSize0 );
				m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );

				for( int s = 0; 
					s < nLightmapSize0; 
					s++, m_LightmapPixelWriter.SkipBytes(nRewindToNextPixel),srcTexelOffset += ( FOUR ))
				{				

					static const fltx4 vSixteen = {16.0f, 16.0f, 16.0f, 16.0f};
					fltx4 vColor[4];
					fltx4 vFloatImage = LoadUnalignedSIMD(&pFloatImage[srcTexelOffset]);
					fltx4 vFloatImageBump1 = LoadUnalignedSIMD(&pFloatImageBump1[srcTexelOffset]);
					fltx4 vFloatImageBump2 = LoadUnalignedSIMD(&pFloatImageBump2[srcTexelOffset]);
					fltx4 vFloatImageBump3 = LoadUnalignedSIMD(&pFloatImageBump3[srcTexelOffset]);
					
					// perform an arcane averaging operation upon the bump map values
					ColorSpace::LinearToBumpedLightmap( vFloatImage,
						vFloatImageBump1, vFloatImageBump2,
						vFloatImageBump3,
						vColor[0], vColor[1], vColor[2], vColor[3] );		

					// convert each color to RGB scaled.
					// DO NOT! make this into a for loop. The (April07 XDK) compiler
					// in fact DOES NOT unroll them, and will perform very naive
					// scheduling if you try. 

					// clamp to 0..16 float
					vColor[0] = MinSIMD(vColor[0], vSixteen);
					vColor[1] = MinSIMD(vColor[1], vSixteen);
					vColor[2] = MinSIMD(vColor[2], vSixteen);
					vColor[3] = MinSIMD(vColor[3], vSixteen);

					// Not doing the following anymore. This path is for writing 16161616 int lightmaps.
					/*
					// compute the scaling factor, transform the RGB,
					// and place the scale in w. Obliterates whatever was
					// already in alpha.
					// This code is why it is important to not use a for
					// loop: you need to let the compiler interleave the
					// inlined instructions.
					vColor[0] = ConvertLightmapColorToRGBScale( vColor[0] );
					vColor[1] = ConvertLightmapColorToRGBScale( vColor[1] );
					vColor[2] = ConvertLightmapColorToRGBScale( vColor[2] );
					vColor[3] = ConvertLightmapColorToRGBScale( vColor[3] );
					*/

					m_LightmapPixelWriter.WritePixelNoAdvance( vColor[0] );
					m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
					m_LightmapPixelWriter.WritePixelNoAdvance( vColor[1] );
					m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
					m_LightmapPixelWriter.WritePixelNoAdvance( vColor[2] );
					m_LightmapPixelWriter.SkipBytes( nLightmap0WriterSizeBytes );
					m_LightmapPixelWriter.WritePixelNoAdvance( vColor[3] );

					AssertMsg(!pfmOut, "Runtime conversion of lightmaps to files is no longer supported on 360.\n");

					// Write data to the bitmapped represenations so that PFM files can be written
					if ( pfmOut )
					{
						Warning("**************************************************\n"
								"Lightmap output to files on 360 HAS BEEN DISABLED.\n"
								"A grave error has just occurred.\n"
								"**************************************************\n");
						DebuggerBreakIfDebugging();
						/*
						PixRGBAF pixelData;
						pixelData.Red = color[0][0];                  
						pixelData.Green = color[0][1];                  
						pixelData.Blue = color[0][2];
						pixelData.Alpha = alpha;
						pfmOut->WritePixelRGBAF(pOffsetIntoLightmapPage[0] + s, pOffsetIntoLightmapPage[1] + t, pixelData);
						*/
					}
				}
			}
		}
#endif
	}
}


//-----------------------------------------------------------------------------
// Writes a non-bumped lightmap into the locked lightmap page as 8-bit texels
// (non-HDR path).
//   pFloatImage             - source texels, stepped sizeof(Vector4D)/sizeof(float)
//                             floats at a time, rows packed back to back
//   pLightmapSize           - [0] = texels per row, [1] = number of rows
//   pOffsetIntoLightmapPage - (x, y) handed to Seek() for the start of row 0
//   pfmOut                  - optional; when non-NULL every written texel is
//                             mirrored into it so a PFM file can be produced
//-----------------------------------------------------------------------------
void CMatLightmaps::LightmapBitsToPixelWriter_LDR( float* pFloatImage, int pLightmapSize[2], int pOffsetIntoLightmapPage[2], FloatBitMap_t *pfmOut )
{
	// The alpha conversion mode is a hardware-config setting that nothing in
	// this function changes, so query it once instead of once per texel.
	const bool bAccurateCSMBlending = HardwareConfig()->GetCSMAccurateBlending();

	// non-HDR lightmap processing
	float *pSrc = pFloatImage;
	const int nFloatsPerTexel = sizeof( Vector4D ) / sizeof( *pSrc );

	for( int t = 0; t < pLightmapSize[1]; ++t )
	{
		// Position the writer at the start of this row; WritePixel() advances it.
		m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );
		for( int s = 0; s < pLightmapSize[0]; ++s, pSrc += nFloatsPerTexel )
		{
			unsigned char color[4];
			ColorSpace::LinearToLightmap( color, pSrc );

			if ( bAccurateCSMBlending )
			{
				ColorSpace::LinearToLightmapAlpha( &color[3], pSrc[3] );
			}
			else
			{
				color[3] = RoundFloatToByte( pSrc[3] * 255.0f );
			}

			m_LightmapPixelWriter.WritePixel( color[0], color[1], color[2], color[3] );

			if ( pfmOut )
			{
				// Write data to the bitmapped representations so that PFM files can be written.
				// The values stored are the converted 8-bit channels, not the source floats.
				PixRGBAF pixelData;
				pixelData.Red = color[0];
				pixelData.Green = color[1];
				pixelData.Blue = color[2];
				pixelData.Alpha = color[3];
				pfmOut->WritePixelRGBAF( pOffsetIntoLightmapPage[0] + s, pOffsetIntoLightmapPage[1] + t, 0, pixelData );
			}
		}
	}
}


//-----------------------------------------------------------------------------
// Writes a non-bumped lightmap into the locked lightmap page through the
// float pixel-writer interface (float HDR path).
//   pFloatImage             - source texels, 4 floats (sizeof(Vector4D)) each,
//                             rows packed back to back. When accurate CSM
//                             blending is enabled the alpha float of each
//                             texel is handed to LinearToLightmapAlpha() by
//                             pointer before it is written
//                             (presumably converted in place -- confirm
//                             against colorspace.h).
//   pLightmapSize           - [0] = texels per row, [1] = number of rows
//   pOffsetIntoLightmapPage - (x, y) handed to Seek() for the start of row 0
//   pfmOut                  - not used by this path
//-----------------------------------------------------------------------------
void CMatLightmaps::LightmapBitsToPixelWriter_HDRF( float* pFloatImage, int pLightmapSize[2], int pOffsetIntoLightmapPage[2], FloatBitMap_t *pfmOut )
{
	if ( IsX360() )
	{
		// 360 does not support HDR float 
		Assert( 0 );
		return;
	}

	// Loop-invariant: query the blending mode once for the whole lightmap.
	const bool bAccurateCSMBlending = HardwareConfig()->GetCSMAccurateBlending();

	// float HDR lightmap processing
	float *pSrc = pFloatImage;
	const int nFloatsPerTexel = sizeof( Vector4D ) / sizeof( *pSrc );

	for ( int t = 0; t < pLightmapSize[1]; ++t )
	{
		m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );

		for ( int s = 0; s < pLightmapSize[0]; ++s, pSrc += nFloatsPerTexel )
		{
			// The alpha conversion has to run for every texel. It previously ran
			// once per row, before this loop, so only the first texel of each
			// row was converted (and pSrc[3] was touched even for empty rows).
			if ( bAccurateCSMBlending )
			{
				ColorSpace::LinearToLightmapAlpha( &pSrc[3] );
			}

			m_LightmapPixelWriter.WritePixelF( pSrc[0], pSrc[1], pSrc[2], pSrc[3] );
		}
	}
}

// Writes a non-bumped lightmap into the locked lightmap page for the integer
// HDR path. Two implementations exist; X360_USE_SIMD_LIGHTMAP selects between
// the scalar one (first) and the SIMD one (after the #else).
//
// numbers come in on the domain [0..16]
//
//   pFloatImage             - source texels; every loop steps 4 floats
//                             (sizeof(Vector4D)) per texel, rows back to back
//   pLightmapSize           - [0] = texels per row, [1] = number of rows
//   pOffsetIntoLightmapPage - (x, y) handed to Seek() for the start of row 0
//   pfmOut                  - optional float bitmap mirroring the texels for
//                             PFM output; only the scalar, non-float-format
//                             branch writes to it
void CMatLightmaps::LightmapBitsToPixelWriter_HDRI( float* RESTRICT pFloatImage, int pLightmapSize[2], int pOffsetIntoLightmapPage[2], FloatBitMap_t * RESTRICT pfmOut )
{
#ifndef X360_USE_SIMD_LIGHTMAP
	// PC code (and old, pre-SIMD xbox version -- unshippably slow)
	if ( m_LightmapPixelWriter.IsUsingFloatFormat() )
	{
		// integer HDR lightmap processing, written out through the float
		// pixel-writer interface. Note that pfmOut is not written in this branch.
		float *pSrc = pFloatImage;
		for ( int t = 0; t < pLightmapSize[1]; ++t )
		{
			m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );
			for ( int s = 0; s < pLightmapSize[0]; ++s, pSrc += (sizeof(Vector4D)/sizeof(*pSrc)) )
			{
				int r, g, b, a;

				r = ColorSpace::LinearFloatToCorrectedShort( pSrc[0] );
				g = ColorSpace::LinearFloatToCorrectedShort( pSrc[1] );
				b = ColorSpace::LinearFloatToCorrectedShort( pSrc[2] );

				// NOTE(review): 'a' is computed here (and again in the 360 block
				// below) but the WritePixelF() call at the end of this loop body
				// passes pSrc[3] as alpha, so this value is never written --
				// confirm whether that is intended.
				if ( HardwareConfig()->GetCSMAccurateBlending() )
				{
					ColorSpace::LinearToLightmapAlpha( &a, pSrc[3] );
				}
				else
				{
					a = ColorSpace::LinearToUnsignedShort( pSrc[3], 16 );
				}

				// Scale applied to the integer channels before they are handed to
				// WritePixelF(): 1/65536 here, 1/256 after the 8-bit conversion below.
				float toFloat = ( 1.0f / ( float )( 1 << 16 ) );

#if ( defined( USE_32BIT_LIGHTMAPS_ON_360 ) )
				if( IsX360() )
				{
					Vector4D vRGBScale;

					vRGBScale.x = r * (16.0f / 65535.0f);
					vRGBScale.y = g * (16.0f / 65535.0f);
					vRGBScale.z = b * (16.0f / 65535.0f);
					vRGBScale = ConvertLightmapColorToRGBScale( &vRGBScale.x );

					r = RoundFloatToByte( vRGBScale.x * 255.0f );
					g = RoundFloatToByte( vRGBScale.y * 255.0f );
					b = RoundFloatToByte( vRGBScale.z * 255.0f );
					a = RoundFloatToByte( vRGBScale.w * 255.0f );

					toFloat = ( 1.0f / ( float )( 1 << 8 ) );
				}

#endif
				Assert( pSrc[3] >= 0.0f && pSrc[3] <= 1.0f );
				m_LightmapPixelWriter.WritePixelF( r * toFloat, g * toFloat, b * toFloat, pSrc[3] );
			}
		}
	}
	else
	{
		// integer HDR lightmap processing, written as integer channels
		float *pSrc = pFloatImage;
		for ( int t = 0; t < pLightmapSize[1]; ++t )
		{
			m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );
			for ( int s = 0; s < pLightmapSize[0]; ++s, pSrc += (sizeof(Vector4D)/sizeof(*pSrc)) )
			{
				int r, g, b, a;

				r = ColorSpace::LinearFloatToCorrectedShort( pSrc[0] );
				g = ColorSpace::LinearFloatToCorrectedShort( pSrc[1] );
				b = ColorSpace::LinearFloatToCorrectedShort( pSrc[2] );

				if ( HardwareConfig()->GetCSMAccurateBlending() )
				{
					ColorSpace::LinearToLightmapAlpha( &a, pSrc[3] );
				}
				else
				{
					a = ColorSpace::LinearToUnsignedShort( pSrc[3], 16 );
				}

#if ( defined( USE_32BIT_LIGHTMAPS_ON_360 ) )
				if( IsX360() )
				{
					// Re-encode as 8-bit RGB plus a scale stored in alpha; this
					// overwrites the alpha computed above.
					Vector4D vRGBScale;

					vRGBScale.x = r * (16.0f / 65535.0f);
					vRGBScale.y = g * (16.0f / 65535.0f);
					vRGBScale.z = b * (16.0f / 65535.0f);
					vRGBScale = ConvertLightmapColorToRGBScale( &vRGBScale.x );

					r = RoundFloatToByte( vRGBScale.x * 255.0f );
					g = RoundFloatToByte( vRGBScale.y * 255.0f );
					b = RoundFloatToByte( vRGBScale.z * 255.0f );
					a = RoundFloatToByte( vRGBScale.w * 255.0f );
				}
#endif
				m_LightmapPixelWriter.WritePixel( r, g, b, a );

				if ( pfmOut )
				{
					// Write data to the bitmapped representations so that PFM files can be written.
					// The PFM receives the unconverted source floats, not r/g/b/a.
					PixRGBAF pixelData;
					pixelData.Red = pSrc[0];                  
					pixelData.Green = pSrc[1];                  
					pixelData.Blue = pSrc[2];
					pixelData.Alpha = pSrc[3];
					pfmOut->WritePixelRGBAF( pOffsetIntoLightmapPage[0] + s, pOffsetIntoLightmapPage[1] + t, 0, pixelData );
				}				
			}
		}
	}
#else
	// XBOX360 code
	if ( m_LightmapPixelWriter.IsUsingFloatFormat() )
	{
		if( IsX360() )
		{
			AssertMsg( false, "Float-format pixel writers do not exist on x360." );
		}
		else
		{	// This code is here as an example only, in case floating point
			// format is restored to 360.

			// integer HDR lightmap processing
			float * RESTRICT pSrc = pFloatImage;
			for ( int t = 0; t < pLightmapSize[1]; ++t )
			{
				m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );
				for ( int s = 0; s < pLightmapSize[0]; ++s, pSrc += (sizeof(Vector4D)/sizeof(*pSrc)) )
				{
					int r, g, b, a;

					r = ColorSpace::LinearFloatToCorrectedShort( pSrc[0] );
					g = ColorSpace::LinearFloatToCorrectedShort( pSrc[1] );
					b = ColorSpace::LinearFloatToCorrectedShort( pSrc[2] );
					a = ColorSpace::LinearToUnsignedShort( pSrc[3], 16 );

					float toFloat = ( 1.0f / ( float )( 1 << 16 ) );

#if ( defined( USE_32BIT_LIGHTMAPS_ON_360 ) )
					if( IsX360() )
					{
						Vector4D vRGBScale;

						vRGBScale.x = r * (16.0f / 65535.0f);
						vRGBScale.y = g * (16.0f / 65535.0f);
						vRGBScale.z = b * (16.0f / 65535.0f);
						vRGBScale = ConvertLightmapColorToRGBScale( &vRGBScale.x );

						r = RoundFloatToByte( vRGBScale.x * 255.0f );
						g = RoundFloatToByte( vRGBScale.y * 255.0f );
						b = RoundFloatToByte( vRGBScale.z * 255.0f );
						a = RoundFloatToByte( vRGBScale.w * 255.0f );

						toFloat = ( 1.0f / ( float )( 1 << 8 ) );
					}

#endif
					Assert( pSrc[3] >= 0.0f && pSrc[3] <= 1.0f );
					m_LightmapPixelWriter.WritePixelF( r * toFloat, g * toFloat, b * toFloat, pSrc[3] );
				}
			}
		}
	}
	else
	{
		// This is the fast X360 pathway.

		// integer HDR lightmap processing
		float * RESTRICT pSrc = pFloatImage;
		// Assert((reinterpret_cast<unsigned int>(pSrc) & 15) == 0); // 16-byte aligned?
		COMPILE_TIME_ASSERT(sizeof(Vector4D)/sizeof(*pSrc) == 4); // assert that 1 * 4 = 4

		// input numbers from pSrc are on the domain [0..+inf]
		// we clamp them to the range [0..16]
		// output is RGBA 
		// the shader does this: rOut = Rin * Ain * 16.0f 
		// where Rin is [0..1], a float computed from a byte value [0..255]
		// Ain is therefore the brightest channel (say R) divided by 16 and quantized
		// Rin is computed from pSrc->r by dividing by Ain
		
		// rather than switching inside WritePixel for each different format,
		// thus causing a 23-cycle pipeline clear for every pixel, we'll
		// branch on the format here. That will allow us to unroll the inline
		// pixel write functions differently depending on their different 
		// latencies. 

		Assert(!pfmOut); // should never happen on 360.
#ifndef ALLOW_PFM_OUTPUT_ON_360
		// Release builds compile the Assert out, so also warn; pfmOut is then
		// simply ignored by the loops below.
		if ( pfmOut )
		{
			Warning("*****************************************\n"
					"Lightmap output on 360 HAS BEEN DISABLED.\n"
					"A grave error has just occurred.\n"
					"*****************************************\n");
		}
#endif

		// switch once, here, outside the loop, rather than
		// switching inside each pixel. Switches are not fast
		// on x360: they are usually implemented as jumps 
		// through function tables, which have a 24-cycle
		// stall. 
		switch (m_LightmapPixelWriter.GetFormat())
		{
			// note: format names are low-order-byte first. 
		case IMAGE_FORMAT_RGBA8888:
		case IMAGE_FORMAT_LINEAR_RGBA8888:
		{
			for ( int t = 0; t < pLightmapSize[1]; ++t )
			{
				m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );
				for ( int s = 0; s < pLightmapSize[0]; ++s, pSrc += 4 )
				{	
					static const fltx4 vSixteen = {16.0f, 16.0f, 16.0f, 16.0f};
					fltx4 rgba = LoadUnalignedSIMD(pSrc);

					// clamp to 0..16 float
					rgba = MinSIMD(rgba, vSixteen);
					// compute the scaling factor, place it in w, and 
					// scale the rest by it.
					rgba = ConvertLightmapColorToRGBScale( rgba );
					// rgba is now  float 0..255 in each component
					// NOTE(review): unlike the BGRA8888 and RGBA16161616 cases
					// below, there is no SkipBytes() after this "NoAdvance"
					// write -- looks like every texel of the row lands on the
					// same destination; confirm against the pixel writer.
					m_LightmapPixelWriter.WritePixelNoAdvance_RGBA8888(rgba);


					/*  // not supported on X360
					if ( pfmOut )
					{
						// Write data to the bitmapped represenations so that PFM files can be written
						PixRGBAF pixelData;
						XMStoreVector4(&pixelData,rgba);
						pfmOut->WritePixelRGBAF( pOffsetIntoLightmapPage[0] + s, pOffsetIntoLightmapPage[1] + t, pixelData );
					}			
					*/
				}
			}
			break;
		}

		case IMAGE_FORMAT_BGRA8888: // NOTE! : the low order bits are first in this naming convention.
		case IMAGE_FORMAT_LINEAR_BGRA8888:
		{			
			for ( int t = 0; t < pLightmapSize[1]; ++t )
			{
				m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );
				for ( int s = 0; s < pLightmapSize[0]; ++s, pSrc += 4 )
				{	
					static const fltx4 vSixteen = {16.0f, 16.0f, 16.0f, 16.0f};
					fltx4 rgba = LoadUnalignedSIMD(pSrc);

					// clamp to 0..16 float
					rgba = MinSIMD(rgba, vSixteen);
					// compute the scaling factor, place it in w, and 
					// scale the rest by it.
					rgba = ConvertLightmapColorToRGBScale( rgba );
					// rgba is now  float 0..255 in each component
					m_LightmapPixelWriter.WritePixelNoAdvance_BGRA8888(rgba);
					// forcibly advance
					m_LightmapPixelWriter.SkipBytes(4);

					/* // not supported on X360
					if ( pfmOut )
					{
						// Write data to the bitmapped represenations so that PFM files can be written
						PixRGBAF pixelData;
						XMStoreVector4(&pixelData,rgba);
						pfmOut->WritePixelRGBAF( pOffsetIntoLightmapPage[0] + s, pOffsetIntoLightmapPage[1] + t, pixelData );
					}			
					*/
				}
			}
			break;
		}

		case IMAGE_FORMAT_RGBA16161616:
		case IMAGE_FORMAT_LINEAR_RGBA16161616:
			{
				// 16-bit-per-channel target: clamp only, no RGB-scale encoding.
				for ( int t = 0; t < pLightmapSize[1]; ++t )
				{
					m_LightmapPixelWriter.Seek( pOffsetIntoLightmapPage[0], pOffsetIntoLightmapPage[1] + t );
					for ( int s = 0; s < pLightmapSize[0]; ++s, pSrc += 4 )
					{	
						static const fltx4 vSixteen = {16.0f, 16.0f, 16.0f, 16.0f};
						fltx4 rgba = LoadUnalignedSIMD(pSrc);
						rgba = MinSIMD(rgba, vSixteen);	// clamp to 0..16 float
						m_LightmapPixelWriter.WritePixelNoAdvance_RGBA16161616(rgba);
						m_LightmapPixelWriter.SkipBytes(8);
					}
				}
				break;
			}

		default:
			// Nothing is written for any other format.
			AssertMsg1(false,"Unsupported pixel format %d while writing lightmaps!", m_LightmapPixelWriter.GetFormat() );
			Warning("Unsupported pixel format used in lightmap. Lightmaps could not be downloaded.\n");
			break;
		}
	}
#endif
}

void CMatLightmaps::BeginUpdateLightmaps( void )
{
	CMatCallQueue *pCallQueue = GetMaterialSystem()->GetRenderContextInternal()->GetCallQueueInternal();
	if ( pCallQueue )
	{
		pCallQueue->QueueCall( this, &CMatLightmaps::BeginUpdateLightmaps );
		return;
	}

	m_nUpdatingLightmapsStackDepth++;
}

void CMatLightmaps::EndUpdateLightmaps( void )
{
	CMatCallQueue *pCallQueue = GetMaterialSystem()->GetRenderContextInternal()->GetCallQueueInternal();
	if ( pCallQueue )
	{
		pCallQueue->QueueCall( this, &CMatLightmaps::EndUpdateLightmaps );
		return;
	}

	m_nUpdatingLightmapsStackDepth--;
	Assert( m_nUpdatingLightmapsStackDepth >= 0 );
	if( m_nUpdatingLightmapsStackDepth <= 0 && m_nLockedLightmap != -1 )
	{
		g_pShaderAPI->TexUnlock();
		m_nLockedLightmap = -1;
	}
}

//-----------------------------------------------------------------------------
// Finds room for a lightmapSize[0] x lightmapSize[1] block in one of the
// dynamic lightmap pages, starting the search at the current dynamic index.
// A page whose recorded frame differs from frameID is stamped with frameID
// and has its packer reset before the block is added.
//   pOutOffsetIntoPage - receives the block's two offsets ([0] and [1])
// Returns the lightmap page index on success, -1 if no dynamic page had room.
//-----------------------------------------------------------------------------
int CMatLightmaps::AllocateDynamicLightmap( int lightmapSize[2], int *pOutOffsetIntoPage, int frameID )
{
	int nPagesTried = 0;
	while ( nPagesTried < COUNT_DYNAMIC_LIGHTMAP_PAGES )
	{
		const int nSlot = ( m_dynamic.currentDynamicIndex + nPagesTried ) % COUNT_DYNAMIC_LIGHTMAP_PAGES;
		const int nPage = m_firstDynamicLightmap + nSlot;

		// First use of this page for frameID: start packing from scratch.
		if ( m_dynamic.lightmapLockFrame[nSlot] != frameID )
		{
			m_dynamic.lightmapLockFrame[nSlot] = frameID;
			m_dynamic.imagePackers[nSlot].Reset( 0, m_pLightmapPages[nPage].m_Width, m_pLightmapPages[nPage].m_Height );
		}

		if ( m_dynamic.imagePackers[nSlot].AddBlock( lightmapSize[0], lightmapSize[1], pOutOffsetIntoPage, pOutOffsetIntoPage + 1 ) )
		{
			return nPage;
		}

		++nPagesTried;
	}

	return -1;
}

//-----------------------------------------------------------------------------
// Updates the lightmap
//-----------------------------------------------------------------------------
void CMatLightmaps::UpdateLightmap( int lightmapPageID, int lightmapSize[2],
									  int offsetIntoLightmapPage[2], 
									  float *pFloatImage, float *pFloatImageBump1,
									  float *pFloatImageBump2, float *pFloatImageBump3 )
{
	VPROF( "CMatRenderContext::UpdateLightmap" );

	bool hasBump = false;
	int uSize = 1;
	FloatBitMap_t *pfmOut = NULL;
	if ( pFloatImageBump1 && pFloatImageBump2 && pFloatImageBump3 )
	{
		hasBump = true;
		uSize = 4;
	}

	if ( lightmapPageID >= GetNumLightmapPages() || lightmapPageID < 0 )
	{
		Error( "MaterialSystem_Interface_t::UpdateLightmap lightmapPageID=%d out of range\n", lightmapPageID );
		return;
	}
	bool bDynamic = IsDynamicLightmap(lightmapPageID);

	if ( bDynamic )
	{
		int dynamicIndex = lightmapPageID-m_firstDynamicLightmap;
		Assert(dynamicIndex < COUNT_DYNAMIC_LIGHTMAP_PAGES);
		m_dynamic.currentDynamicIndex = (dynamicIndex + 1) % COUNT_DYNAMIC_LIGHTMAP_PAGES;
	}

	if ( mat_lightmap_pfms.GetBool())
	{
		// Allocate and initialize lightmap data that will be written to a PFM file
		if (NULL == m_pLightmapDataPtrArray[lightmapPageID])
		{
			m_pLightmapDataPtrArray[lightmapPageID] = new FloatBitMap_t(m_pLightmapPages[lightmapPageID].m_Width, m_pLightmapPages[lightmapPageID].m_Height);
			m_pLightmapDataPtrArray[lightmapPageID]->Clear(0, 0, 0, 1);
		}
		pfmOut = m_pLightmapDataPtrArray[lightmapPageID];
	}

	// NOTE: Change how the lock is taking place if you ever change how bumped
	// lightmaps are put into the page. Right now, we assume that they're all
	// added to the right of the original lightmap.
	bool bLockSubRect;
	{
		VPROF_( "Locking lightmaps", 2, VPROF_BUDGETGROUP_DLIGHT_RENDERING, false, 0 ); // vprof scope

		bLockSubRect = m_nUpdatingLightmapsStackDepth <= 0 && !bDynamic;
		if( bLockSubRect )
		{
			VPROF_INCREMENT_COUNTER( "lightmap subrect texlock", 1 );
			g_pShaderAPI->ModifyTexture( m_LightmapPageTextureHandles[lightmapPageID] );
			if (!g_pShaderAPI->TexLock( 0, 0, offsetIntoLightmapPage[0], offsetIntoLightmapPage[1],
				lightmapSize[0] * uSize, lightmapSize[1], m_LightmapPixelWriter ))
			{
				return;
			}
		}
		else if( lightmapPageID != m_nLockedLightmap )
		{
			if ( !LockLightmap( lightmapPageID ) )
			{
				ExecuteNTimes( 10, Warning( "Failed to lock lightmap\n" ) );
				return;
			}
		}
	}

	int subRectOffset[2] = {0,0};

	{
		// account for the part spent in math:
		VPROF_( "LightmapBitsToPixelWriter", 2, VPROF_BUDGETGROUP_DLIGHT_RENDERING, false, 0 );
#ifdef _PS3
		// PS3 uses 16-bit half floats per channel...but the HDR_TYPE_FLOAT codepath has a lot of other assumptions, so just
		// lie about the format right here on PS3 only
		if ( hasBump )
		{
			BumpedLightmapBitsToPixelWriter_HDRF( pFloatImage, pFloatImageBump1, pFloatImageBump2, pFloatImageBump3, 
				lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
		}
		else
		{
			LightmapBitsToPixelWriter_HDRF( pFloatImage, lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
		}
#else // _PS3
		if ( hasBump )
		{
			switch( HardwareConfig()->GetHDRType() )
			{
			case HDR_TYPE_NONE:
				BumpedLightmapBitsToPixelWriter_LDR( pFloatImage, pFloatImageBump1, pFloatImageBump2, pFloatImageBump3, 
					lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
				break;
			case HDR_TYPE_INTEGER:
				BumpedLightmapBitsToPixelWriter_HDRI( pFloatImage, pFloatImageBump1, pFloatImageBump2, pFloatImageBump3, 
					lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
				break;
			case HDR_TYPE_FLOAT:
				BumpedLightmapBitsToPixelWriter_HDRF( pFloatImage, pFloatImageBump1, pFloatImageBump2, pFloatImageBump3, 
					lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
				break;
			}
		}
		else
		{
			switch ( HardwareConfig()->GetHDRType() )
			{
			case HDR_TYPE_NONE:
				LightmapBitsToPixelWriter_LDR( pFloatImage, lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
				break;

			case HDR_TYPE_INTEGER:
				LightmapBitsToPixelWriter_HDRI( pFloatImage, lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
				break;

			case HDR_TYPE_FLOAT:
				LightmapBitsToPixelWriter_HDRF( pFloatImage, lightmapSize, bLockSubRect ? subRectOffset : offsetIntoLightmapPage, pfmOut );
				break;

			default:
				Assert( 0 );
				break;
			}
		}
#endif // !_PS3
	}

	if( bLockSubRect )
	{
		VPROF_( "Unlocking Lightmaps", 2, VPROF_BUDGETGROUP_DLIGHT_RENDERING, false, 0 );
		g_pShaderAPI->TexUnlock();
	}
}

//-----------------------------------------------------------------------------
// Returns the stored sort ID count (m_numSortIDs). GetSortInfo() asserts that
// it fills exactly this many entries.
//-----------------------------------------------------------------------------
int CMatLightmaps::GetNumSortIDs( void )
{
	const int nSortIDs = m_numSortIDs;
	return nSortIDs;
}

//-----------------------------------------------------------------------------
// Appends one sort entry per (material, lightmap page) pair to pInfo, walking
// every material in the dictionary and every page in the material's
// [min, max] lightmap page range. Materials whose min page ID is greater than
// their max page ID contribute nothing.
//   pInfo  - array being filled; the caller must have sized it
//   sortId - in: index of the first entry to write, out: one past the last
//   alpha  - not used
//-----------------------------------------------------------------------------
void CMatLightmaps::ComputeSortInfo( MaterialSystem_SortInfo_t* pInfo, int& sortId, bool alpha )
{
	for ( MaterialHandle_t hMat = GetMaterialDict()->FirstMaterial();
		  hMat != GetMaterialDict()->InvalidMaterial();
		  hMat = GetMaterialDict()->NextMaterial( hMat ) )
	{
		IMaterialInternal *pMat = GetMaterialInternal( hMat );

		// (A translucency filter driven by 'alpha' used to live here; it has
		// been disabled, so every material with a valid page range is emitted.)
		if ( pMat->GetMinLightmapPageID() <= pMat->GetMaxLightmapPageID() )
		{
			// fill in the lightmapped materials
			for ( int nPage = pMat->GetMinLightmapPageID(); nPage <= pMat->GetMaxLightmapPageID(); ++nPage )
			{
				MaterialSystem_SortInfo_t &entry = pInfo[sortId];
				entry.material = pMat->GetQueueFriendlyVersion();
				entry.lightmapPageID = nPage;
#if 0
				char buf[128];
				Q_snprintf( buf, sizeof( buf ), "ComputeSortInfo: %s lightmapPageID: %d sortID: %d\n", pMat->GetName(), nPage, sortId );
				OutputDebugString( buf );
#endif
				++sortId;
			}
		}
	}
}

//-----------------------------------------------------------------------------
// Appends one sort entry to pInfo for every material that needs a white
// lightmap and has a reference count above zero. The entry's page is the
// white-bump page when the material needs bumped lightmaps, otherwise the
// plain white page.
//   pInfo  - array being filled; the caller must have sized it
//   sortId - in: index of the first entry to write, out: one past the last
//   alpha  - not used
//-----------------------------------------------------------------------------
void CMatLightmaps::ComputeWhiteLightmappedSortInfo( MaterialSystem_SortInfo_t* pInfo, int& sortId, bool alpha )
{
	for ( MaterialHandle_t hMat = GetMaterialDict()->FirstMaterial();
		  hMat != GetMaterialDict()->InvalidMaterial();
		  hMat = GetMaterialDict()->NextMaterial( hMat ) )
	{
		IMaterialInternal *pMat = GetMaterialInternal( hMat );

		// Only materials that are actually used by this level get an entry.
		if ( !pMat->GetNeedsWhiteLightmap() || pMat->GetReferenceCount() <= 0 )
		{
			continue;
		}

		// (A translucency filter driven by 'alpha' used to live here; it has
		// been disabled.)
		MaterialSystem_SortInfo_t &entry = pInfo[sortId];
		entry.material = pMat->GetQueueFriendlyVersion();
		entry.lightmapPageID = pMat->GetPropertyFlag( MATERIAL_PROPERTY_NEEDS_BUMPED_LIGHTMAPS )
			? MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE_BUMP
			: MATERIAL_SYSTEM_LIGHTMAP_PAGE_WHITE;

		++sortId;
	}
}

//-----------------------------------------------------------------------------
// Fills pSortInfoArray: first the entries for materials with real lightmap
// pages, then the entries for white-lightmapped materials. The number of
// entries written is asserted to equal m_numSortIDs.
//-----------------------------------------------------------------------------
void CMatLightmaps::GetSortInfo( MaterialSystem_SortInfo_t *pSortInfoArray )
{
	int nEntriesWritten = 0;

	// Both passes are run with alpha == false and advance the shared counter.
	ComputeSortInfo( pSortInfoArray, nEntriesWritten, false );
	ComputeWhiteLightmappedSortInfo( pSortInfoArray, nEntriesWritten, false );

	Assert( m_numSortIDs == nEntriesWritten );
}

//-----------------------------------------------------------------------------
// Sets the min and mag filter of every lightmap page texture: linear when
// 'enabled' is true, nearest otherwise.
//-----------------------------------------------------------------------------
void CMatLightmaps::EnableLightmapFiltering( bool enabled )
{
	for ( int nPage = 0; nPage < GetNumLightmapPages(); ++nPage )
	{
		// Select the page texture, then apply the same mode to both filters.
		g_pShaderAPI->ModifyTexture( m_LightmapPageTextureHandles[nPage] );
		g_pShaderAPI->TexMinFilter( enabled ? SHADER_TEXFILTERMODE_LINEAR : SHADER_TEXFILTERMODE_NEAREST );
		g_pShaderAPI->TexMagFilter( enabled ? SHADER_TEXFILTERMODE_LINEAR : SHADER_TEXFILTERMODE_NEAREST );
	}
}