//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: 
//
// $NoKeywords: $
//
//===========================================================================//

#ifndef PIXELWRITER_H
#define PIXELWRITER_H

#ifdef _WIN32
#pragma once
#endif

#if defined( _WIN32 ) || defined( _PS3 )
#define FORCEINLINE_PIXEL FORCEINLINE
#elif POSIX
#define FORCEINLINE_PIXEL inline
#else
#error "implement me"
#endif

// This flag allows us to write to formats for which we don't support direct pixel
// access (like DXT1) without spewing errors.  The only action available for
// these formats is direct access to the bitstream.
#define ALLOW_UNSUPPORTED_FORMATS 1

#include "bitmap/imageformat.h"
#include "tier0/dbg.h"
#include "mathlib/compressed_vector.h"
#include "mathlib/ssemath.h"
#include "mathlib/vector4d.h"
#include "cache_hints.h"

//-----------------------------------------------------------------------------
// Color writing class 
//-----------------------------------------------------------------------------

// CPixelWriter packs color components into a caller-supplied pixel buffer.
// SetPixelMemory() selects the image format (which fixes the per-pixel byte
// size and the per-channel shift/mask tables) and the destination memory; the
// Write*/Read* methods then operate at an internal write cursor that Seek(),
// SkipBytes(), SkipPixels() and the advancing Write* variants move.
// The class has no constructor: every member is uninitialized until
// SetPixelMemory() has been called.
class CPixelWriter
{
public:
	// Binds the writer to pMemory, records the row pitch and derives the
	// channel layout from 'format'. The write cursor is reset to pMemory.
	// Note that the stride is stored in an unsigned short.
	FORCEINLINE void SetPixelMemory( ImageFormat format, void* pMemory, int stride );
	// Returns the base address passed to SetPixelMemory().
	FORCEINLINE void *GetPixelMemory() { return m_pBase; }

	// this is no longer used:
#if 0 // defined( _X360 )
	// set after SetPixelMemory() 
	FORCEINLINE void ActivateByteSwapping( bool bSwap );
#endif

	// Moves the write cursor to pixel (x, y) relative to the base address.
	FORCEINLINE void Seek( int x, int y );
	// Advances the write cursor by n bytes and returns the new cursor.
	FORCEINLINE void* SkipBytes( int n ) RESTRICT;
	// Advances the write cursor by n pixels of the current format.
	FORCEINLINE void SkipPixels( int n );	
	// Integer writes: each component is masked and shifted into place
	// according to the current format. The "NoAdvance" variants leave the
	// cursor where it is; the others step it forward by one pixel.
	FORCEINLINE void WritePixel( int r, int g, int b, int a = 255 );
	FORCEINLINE void WritePixelNoAdvance( int r, int g, int b, int a = 255 );
	// Same packing as above, but the result is stored one byte at a time
	// through a signed char pointer.
	FORCEINLINE void WritePixelSigned( int r, int g, int b, int a = 255 );
	FORCEINLINE void WritePixelNoAdvanceSigned( int r, int g, int b, int a = 255 );
	// Reads the pixel under the cursor and splits it into components using
	// the current format's shifts and masks. Does not move the cursor.
	FORCEINLINE void ReadPixelNoAdvance( int &r, int &g, int &b, int &a );

	// Floating point formats
	FORCEINLINE void WritePixelNoAdvanceF( float r, float g, float b, float a = 1.0f );
	FORCEINLINE void WritePixelF( float r, float g, float b, float a = 1.0f );
	FORCEINLINE void WriteManyPixelF( const float * RESTRICT pSrc, const int num  ); // write a contiguous stream of 4-floats. 


	// SIMD formats
	// (declared unconditionally, but this header only defines them in its
	// _X360 and _PS3 sections)
	FORCEINLINE void WritePixel( FLTX4 rgba ) RESTRICT;
	FORCEINLINE void WritePixelNoAdvance( FLTX4 rgba ) RESTRICT;

#if defined ( _X360 ) || defined  ( _PS3 )
	// here are some explicit formats so we can avoid the switch:
	FORCEINLINE void WritePixelNoAdvance_RGBA8888( FLTX4 rgba );
	FORCEINLINE void WritePixelNoAdvance_BGRA8888( FLTX4 rgba );
	// as above, but with m_pBits passed in to avoid a LHS
	FORCEINLINE void WritePixelNoAdvance_BGRA8888( FLTX4 rgba, void *pBits ) RESTRICT;
	// for writing entire SIMD registers at once when they have
	// already been packed, and when m_pBits is vector-aligned
	// (which is a requirement for write-combined memory)
	// offset is added to m_pBits (saving you from the obligatory
	// LHS of a SkipBytes)
	FORCEINLINE void WriteFourPixelsExplicitLocation_BGRA8888( FLTX4 rgba, int offset );

	FORCEINLINE void WritePixelNoAdvance_RGBA16161616( FLTX4 rgba );
#endif

	// Converts four floats to 16-bit floats and stores them at the cursor.
	FORCEINLINE void WritePixelNoAdvance16F( float r, float g, float b, float a );


	// Size of one pixel in bytes; 0 when the format is not supported.
	FORCEINLINE unsigned char GetPixelSize() { return m_Size; }	
	// Row pitch in bytes as recorded by SetPixelMemory().
	FORCEINLINE unsigned short GetBytesPerRow() { return m_BytesPerRow; }

	FORCEINLINE bool IsUsingFloatFormat() const;
	FORCEINLINE bool IsUsing16BitFloatFormat() const;

	// We allow "unsupported" formats only if you are writing directly into the bitstream
	FORCEINLINE bool IsUsingSupportedFormat() const;

	// Returns the current write cursor.
	FORCEINLINE unsigned char *GetCurrentPixel() { return m_pBits; }

private:
	// helper functions for some explicit combinations of flags and sizes -- lets us
	// do some conversions on the GPRs using bitshifts rather than a round trip to the
	// FPU and a LHS.
	FORCEINLINE void WriteManyPixelTo16BitF( const float * RESTRICT pSrc, int num  ) RESTRICT; // write a contiguous stream of 4-floats. 
	// FORCEINLINE void WriteManyPixelTo32BitF( const float * RESTRICT pSrc, const int num  ); // write a contiguous stream of 4-floats. 

	// NOTE(review): declared but no definition is visible in this header.
	FORCEINLINE void AssertFormatIsSupported( ImageFormat format ) const;

	// Bit flags stored in m_nFlags.
	enum
	{
		PIXELWRITER_USING_FLOAT_FORMAT       = 0x01,
		PIXELWRITER_USING_16BIT_FLOAT_FORMAT = 0x02,
		PIXELWRITER_SWAPBYTES                = 0x04,
		PIXELWRITER_USING_UNSUPPORTED_FORMAT = 0x08,
	};

	unsigned char*	m_pBase;		// start of the buffer given to SetPixelMemory()
	unsigned char*	m_pBits;		// current write cursor
	unsigned short	m_BytesPerRow;	// row pitch in bytes
	unsigned char	m_Size;			// bytes per pixel (0 for unsupported formats)
	unsigned char	m_nFlags;		// combination of the PIXELWRITER_* flags above
	// Bit position of each channel inside the packed pixel. The integer
	// writers shift blue right instead of left when m_BShift is not positive.
	signed short	m_RShift;
	signed short	m_GShift;
	signed short	m_BShift;
	signed short	m_AShift;
	// Masks applied to each incoming component before it is shifted.
	unsigned int	m_RMask;
	unsigned int	m_GMask;
	unsigned int	m_BMask;
	unsigned int	m_AMask;

#if defined ( _X360 ) || defined  ( _PS3 )
	// The format handed to SetPixelMemory(); the SIMD writers switch on it.
	ImageFormat		m_Format;
public:
	inline const ImageFormat &GetFormat() { return m_Format; }
private:
#endif
};

// True when the current format was flagged as a floating point format.
FORCEINLINE_PIXEL bool CPixelWriter::IsUsingFloatFormat() const
{
	const int nFloatBit = m_nFlags & PIXELWRITER_USING_FLOAT_FORMAT;
	return nFloatBit ? true : false;
}

// True when the current format was flagged as a 16-bit floating point format.
FORCEINLINE_PIXEL bool CPixelWriter::IsUsing16BitFloatFormat() const
{
	const int nHalfFloatBit = m_nFlags & PIXELWRITER_USING_16BIT_FLOAT_FORMAT;
	return nHalfFloatBit ? true : false;
}

// True unless SetPixelMemory() flagged the format as unsupported.
FORCEINLINE_PIXEL bool CPixelWriter::IsUsingSupportedFormat() const
{
	return !( m_nFlags & PIXELWRITER_USING_UNSUPPORTED_FORMAT );
}

//-----------------------------------------------------------------------------
// Binds the writer to a block of pixel memory and configures the channel
// layout for 'format'.
//   format  - image format; selects m_Size, the shift/mask tables and flags
//   pMemory - start of the pixel data; also becomes the current write cursor
//   stride  - bytes per row. NOTE: stored in an unsigned short, so values
//             above 65535 do not survive the narrowing cast.
// Formats without a case below are given m_Size == 0 so that the pixel
// writers do not touch memory for them.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::SetPixelMemory( ImageFormat format, void* pMemory, int stride )
{
	m_pBits = (unsigned char*)pMemory;
	m_pBase = m_pBits;
	m_BytesPerRow = (unsigned short)stride;
	m_nFlags = 0;
#if defined ( _X360 ) || defined  ( _PS3 )
	m_Format = format;
#endif

	switch ( format )
	{
	case IMAGE_FORMAT_R32F: // NOTE! : the low order bits are first in this naming convention.
		m_Size = 4;
		m_RShift = 0;
		m_GShift = 0;
		m_BShift = 0;
		m_AShift = 0;
		m_RMask = 0xFFFFFFFF;
		m_GMask = 0x0;
		m_BMask = 0x0;
		m_AMask = 0x0;
		m_nFlags |= PIXELWRITER_USING_FLOAT_FORMAT;
		break;

	case IMAGE_FORMAT_RGBA32323232F:
		m_Size = 16;
		m_RShift = 0;
		m_GShift = 32;
		m_BShift = 64;
		m_AShift = 96;
		m_RMask = 0xFFFFFFFF;
		m_GMask = 0xFFFFFFFF;
		m_BMask = 0xFFFFFFFF;
		m_AMask = 0xFFFFFFFF;
		m_nFlags |= PIXELWRITER_USING_FLOAT_FORMAT;
		break;

	case IMAGE_FORMAT_RGBA16161616F:
		m_Size = 8;
		m_RShift = 0;
		m_GShift = 16;
		m_BShift = 32;
		m_AShift = 48;
		m_RMask = 0xFFFF;
		m_GMask = 0xFFFF;
		m_BMask = 0xFFFF;
		m_AMask = 0xFFFF;
		m_nFlags |= PIXELWRITER_USING_FLOAT_FORMAT | PIXELWRITER_USING_16BIT_FLOAT_FORMAT;
		break;

	case IMAGE_FORMAT_RGBA8888:
#if defined( _X360 )
	case IMAGE_FORMAT_LINEAR_RGBA8888:
#endif
		m_Size = 4;
		m_RShift = 0;
		m_GShift = 8;
		m_BShift = 16;
		m_AShift = 24;
		m_RMask = 0xFF;
		m_GMask = 0xFF;
		m_BMask = 0xFF;
		m_AMask = 0xFF;
		break;

	case IMAGE_FORMAT_BGRA1010102: // NOTE! : the low order bits are first in this naming convention.
		m_Size = 4;
		m_RShift = 20;
		m_GShift = 10;
		m_BShift = 0;
		m_AShift = 30;
		m_RMask = 0x3FF;
		m_GMask = 0x3FF;
		m_BMask = 0x3FF;
		m_AMask = 0x03;
		break;

	case IMAGE_FORMAT_BGRA8888: // NOTE! : the low order bits are first in this naming convention.
#if defined( _X360 )
	case IMAGE_FORMAT_LINEAR_BGRA8888:
#endif
		m_Size = 4;
		m_RShift = 16;
		m_GShift = 8;
		m_BShift = 0;
		m_AShift = 24;
		m_RMask = 0xFF;
		m_GMask = 0xFF;
		m_BMask = 0xFF;
		m_AMask = 0xFF;
		break;

	case IMAGE_FORMAT_BGRX8888:
#if defined( _X360 )
	case IMAGE_FORMAT_LINEAR_BGRX8888:
#endif
		m_Size = 4;
		m_RShift = 16;
		m_GShift = 8;
		m_BShift = 0;
		m_AShift = 24;
		m_RMask = 0xFF;
		m_GMask = 0xFF;
		m_BMask = 0xFF;
		m_AMask = 0x00;
		break;

	// For the 16-bit formats below the masks select the HIGH bits of each
	// 8-bit input component, and the shifts move them into place; a negative
	// blue shift means "shift right".
	case IMAGE_FORMAT_BGRA4444:
		m_Size = 2;
		m_RShift = 4;
		m_GShift = 0;
		m_BShift = -4;
		m_AShift = 8;
		m_RMask = 0xF0;
		m_GMask = 0xF0;
		m_BMask = 0xF0;
		m_AMask = 0xF0;
		break;

	case IMAGE_FORMAT_BGR888:
		m_Size = 3;
		m_RShift = 16;
		m_GShift = 8;
		m_BShift = 0;
		m_AShift = 0;
		m_RMask = 0xFF;
		m_GMask = 0xFF;
		m_BMask = 0xFF;
		m_AMask = 0x00;
		break;

	case IMAGE_FORMAT_BGR565:
		m_Size = 2;
		m_RShift = 8;
		m_GShift = 3;
		m_BShift = -3;
		m_AShift = 0;
		m_RMask = 0xF8;
		m_GMask = 0xFC;
		m_BMask = 0xF8;
		m_AMask = 0x00;
		break;

	case IMAGE_FORMAT_BGRA5551:
	case IMAGE_FORMAT_BGRX5551:
		m_Size = 2;
		m_RShift = 7;
		m_GShift = 2;
		m_BShift = -3;
		m_AShift = 8;
		m_RMask = 0xF8;
		m_GMask = 0xF8;
		m_BMask = 0xF8;
		m_AMask = 0x80;
		break;

	// GR - alpha format for HDR support
	case IMAGE_FORMAT_A8:
#if defined( _X360 )
	case IMAGE_FORMAT_LINEAR_A8:
#endif
		m_Size = 1;
		m_RShift = 0;
		m_GShift = 0;
		m_BShift = 0;
		m_AShift = 0;
		m_RMask = 0x00;
		m_GMask = 0x00;
		m_BMask = 0x00;
		m_AMask = 0xFF;
		break;

	case IMAGE_FORMAT_UVWQ8888:
		m_Size = 4;
		m_RShift = 0;
		m_GShift = 8;
		m_BShift = 16;
		m_AShift = 24;
		m_RMask = 0xFF;
		m_GMask = 0xFF;
		m_BMask = 0xFF;
		m_AMask = 0xFF;
		break;

	case IMAGE_FORMAT_RGBA16161616:
#if defined( _X360 )
	case IMAGE_FORMAT_LINEAR_RGBA16161616:
#endif		
		m_Size = 8;
		if ( !IsX360() )
		{
			m_RShift = 0;
			m_GShift = 16;
			m_BShift = 32;
			m_AShift = 48;
		}
		else
		{
			// channel order is reversed on the 360
			m_RShift = 48;
			m_GShift = 32;
			m_BShift = 16;
			m_AShift = 0;
		}
		m_RMask = 0xFFFF;
		m_GMask = 0xFFFF;
		m_BMask = 0xFFFF;
		m_AMask = 0xFFFF;
		break;

	case IMAGE_FORMAT_I8:
#if defined( _X360 )
	case IMAGE_FORMAT_LINEAR_I8:
#endif
		// whatever goes into R is considered the intensity.
		m_Size = 1;
		m_RShift = 0;
		m_GShift = 0;
		m_BShift = 0;
		m_AShift = 0;
		m_RMask = 0xFF;
		m_GMask = 0x00;
		m_BMask = 0x00;
		m_AMask = 0x00;
		break;
	// FIXME: Add more color formats as need arises
	default:
		{
#if ALLOW_UNSUPPORTED_FORMATS
			m_nFlags |= PIXELWRITER_USING_UNSUPPORTED_FORMAT;
#else // ALLOW_UNSUPPORTED_FORMATS
			// Report each unsupported format once. The "already reported" table
			// is only indexed with ids inside [0, NUM_IMAGE_FORMATS); anything
			// else is reported every time rather than indexing out of bounds.
			static bool format_error_printed[NUM_IMAGE_FORMATS];
			const bool bIndexable = ( (int)format >= 0 ) && ( (int)format < (int)NUM_IMAGE_FORMATS );
			if ( !bIndexable || !format_error_printed[format] )
			{
				Assert( 0 );
				Msg( "CPixelWriter::SetPixelMemory:  Unsupported image format %i\n", format );
				if ( bIndexable )
				{
					format_error_printed[format] = true;
				}
			}
#endif // ALLOW_UNSUPPORTED_FORMATS
			m_Size = 0; // set to zero so that we don't stomp memory for formats that we don't understand.
			m_RShift = 0;
			m_GShift = 0;
			m_BShift = 0;
			m_AShift = 0;
			m_RMask  = 0xFF;
			m_GMask  = 0x00;
			m_BMask  = 0x00;
			m_AMask  = 0x00;
		}
		break;
	}
}

#if 0 // defined( _X360 )
// Compiled out. Toggles the PIXELWRITER_SWAPBYTES flag and, when the state
// actually changes, replaces every channel shift s with (24 - s).
FORCEINLINE void CPixelWriter::ActivateByteSwapping( bool bSwap )
{
	// X360TBD: Who is trying to use this?
	// Purposely not hooked up because PixelWriter has been ported to read/write native pixels only
	Assert( 0 );

	const bool bAlreadySwapping = ( m_nFlags & PIXELWRITER_SWAPBYTES ) != 0;
	if ( bSwap == bAlreadySwapping )
	{
		// same state
		return;
	}

	if ( bSwap )
	{
		m_nFlags |= PIXELWRITER_SWAPBYTES;

		// only tested with 4 byte formats
		Assert( m_Size == 4 );
	}
	else
	{
		m_nFlags &= ~PIXELWRITER_SWAPBYTES;
	}

	// swap the shifts
	m_RShift = 24 - m_RShift;
	m_GShift = 24 - m_GShift;
	m_BShift = 24 - m_BShift;
	m_AShift = 24 - m_AShift;
}
#endif

//-----------------------------------------------------------------------------
// Moves the write cursor to pixel (x, y), measured from the base address
// using the stored row pitch and pixel size
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::Seek( int x, int y )
{
	Assert( IsUsingSupportedFormat() );
	unsigned char *pRowStart = m_pBase + y * m_BytesPerRow;
	m_pBits = pRowStart + x * m_Size;
}


//-----------------------------------------------------------------------------
// Advances the write cursor by n bytes and returns the new cursor
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void* CPixelWriter::SkipBytes( int n ) RESTRICT
{
	unsigned char *pAdvanced = m_pBits + n;
	m_pBits = pAdvanced;
	return pAdvanced;
}


//-----------------------------------------------------------------------------
// Advances the write cursor by n pixels of the current format
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::SkipPixels( int n )
{
	Assert( IsUsingSupportedFormat() );
	const int nByteCount = n * m_Size;
	SkipBytes( nByteCount );
}

//-----------------------------------------------------------------------------
// Writes one floating point pixel at the cursor without moving the cursor.
// 16-bit float formats are forwarded to WritePixelNoAdvance16F; otherwise the
// raw 32-bit patterns of the floats are masked and placed into 32-bit slots
// chosen by each channel's shift. Asserts on anything but PC / PS3.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixelNoAdvanceF( float r, float g, float b, float a )
{
	Assert( IsUsingSupportedFormat() );
	Assert( IsUsingFloatFormat() );

	// X360TBD: Not ported
	Assert( IsPC() || IsPS3() );

	if ( m_nFlags & PIXELWRITER_USING_16BIT_FLOAT_FORMAT )
	{
		WritePixelNoAdvance16F( r, g, b, a );
		return;
	}

	// fp32: (shift >> 5) picks the 32-bit slot, (shift & 0x1F) the bit inside it
	int packedWords[4] = { 0, 0, 0, 0 };
	packedWords[ m_RShift >> 5 ] |= (FloatBits(r) & m_RMask) << ( m_RShift & 0x1F );
	packedWords[ m_GShift >> 5 ] |= (FloatBits(g) & m_GMask) << ( m_GShift & 0x1F );
	packedWords[ m_BShift >> 5 ] |= (FloatBits(b) & m_BMask) << ( m_BShift & 0x1F );
	packedWords[ m_AShift >> 5 ] |= (FloatBits(a) & m_AMask) << ( m_AShift & 0x1F );

	// only the first m_Size bytes belong to the pixel
	memcpy( m_pBits, packedWords, m_Size );
}


//-----------------------------------------------------------------------------
// Converts four floats to 16-bit floats and stores them at the cursor without
// moving the cursor
//-----------------------------------------------------------------------------
FORCEINLINE void CPixelWriter::WritePixelNoAdvance16F( float r, float g, float b, float a )
{
	if ( IsPS3() )
	{
		// we know what the values of shift and mask are going to be because 
		// of the format, so we can elide them and write directly
		float16 *pDest = reinterpret_cast<float16 *>(m_pBits);

		pDest[0].SetFloat( r );
		pDest[1].SetFloat( g );
		pDest[2].SetFloat( b );
		pDest[3].SetFloat( a );
		return;
	}

	float16 halves[4];
	halves[0].SetFloat( r );
	halves[1].SetFloat( g );
	halves[2].SetFloat( b );
	halves[3].SetFloat( a );

	// fp16: (shift >> 4) picks the 16-bit slot, (shift & 0xF) the bit inside it
	unsigned short packedHalves[4] = { 0, 0, 0, 0 };
	packedHalves[ m_RShift >> 4 ] |= (halves[0].GetBits() & m_RMask) << ( m_RShift & 0xF );
	packedHalves[ m_GShift >> 4 ] |= (halves[1].GetBits() & m_GMask) << ( m_GShift & 0xF );
	packedHalves[ m_BShift >> 4 ] |= (halves[2].GetBits() & m_BMask) << ( m_BShift & 0xF );
	packedHalves[ m_AShift >> 4 ] |= (halves[3].GetBits() & m_AMask) << ( m_AShift & 0xF );
	memcpy( m_pBits, packedHalves, m_Size );
}

//-----------------------------------------------------------------------------
// Writes 'num' pixels to a four-channel 16-bit float destination, advancing
// the write cursor past them.
//   pSrc - contiguous stream of 4 floats per pixel
//   num  - number of pixels; zero or negative writes nothing
// Preconditions (checked by the caller, not here): the pixel size is
// 4 * sizeof(unsigned short) and the shifts are 0 / 16 / 32 / 48, so each
// float can be converted straight into its slot.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WriteManyPixelTo16BitF( const float * RESTRICT pSrc, int num  ) RESTRICT
{
	Assert( IsUsingSupportedFormat() );
	const static int SIZE = 4*sizeof(unsigned short); // known precondition

	// Work on a local copy of the cursor: the compiler actually fails to hoist
	// m_pBits onto a register properly otherwise.
	unsigned char *pBits = m_pBits;

	// "num > 0" (rather than "num != 0") so that a negative count is a no-op,
	// matching the general path in WriteManyPixelF, instead of running away
	// through memory.
	// (Prefetching the source every cache line was tried here and actually
	// slowed things down.)
	for ( ; num > 0; --num )
	{
		float16 * RESTRICT pOut = reinterpret_cast< float16 * >(pBits);
		pOut[0].SetFloat( pSrc[0] );
		pOut[1].SetFloat( pSrc[1] );
		pOut[2].SetFloat( pSrc[2] );
		pOut[3].SetFloat( pSrc[3] );

		pSrc += 4;
		pBits += SIZE;
	}

	m_pBits = pBits;
}


//-----------------------------------------------------------------------------
// Writes one floating point pixel, then steps the cursor to the next pixel
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixelF( float r, float g, float b, float a )
{
	WritePixelNoAdvanceF( r, g, b, a );
	m_pBits = m_pBits + m_Size;
}

//-----------------------------------------------------------------------------
// Writes 'num' floating point pixels and advances the write cursor past them.
// pSrc must be a contiguous stream of Vector4Ds (four consecutive floats per
// pixel, pixels consecutive in memory). Four-channel 16-bit float formats take
// the dedicated fast path; everything else goes pixel by pixel.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WriteManyPixelF( const float * RESTRICT pSrc, const int num  )
{
	Assert( IsUsingSupportedFormat() );
	Assert( IsUsingFloatFormat() );
	// X360TBD: Not ported
	Assert( IsPC() || IsPS3() );

	const bool bFourHalfFloats = ( m_Size == 4*sizeof(unsigned short) ) &&
		( ( m_nFlags & PIXELWRITER_USING_16BIT_FLOAT_FORMAT ) != 0 );
	if ( bFourHalfFloats )
	{
		Assert( m_RShift == 0 && m_GShift == 16 && m_BShift == 32 && m_AShift == 48 );
		WriteManyPixelTo16BitF( pSrc, num );
		return;
	}

	// naive general case
	const float * const pEnd = pSrc + ( num * 4 );
	while ( pSrc < pEnd )
	{
		WritePixelF( pSrc[0], pSrc[1], pSrc[2], pSrc[3] );
		pSrc += 4;
	}
}

//-----------------------------------------------------------------------------
// Writes one integer pixel, then steps the cursor to the next pixel
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixel( int r, int g, int b, int a )
{
	WritePixelNoAdvance( r, g, b, a );
	m_pBits = m_pBits + m_Size;
}

//-----------------------------------------------------------------------------
// Writes one pixel through the signed-byte path, then steps the cursor to the
// next pixel
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixelSigned( int r, int g, int b, int a )
{
	WritePixelNoAdvanceSigned( r, g, b, a );
	m_pBits = m_pBits + m_Size;
}


//-----------------------------------------------------------------------------
// Packs r, g, b, a according to the current format's masks and shifts and
// stores the result at the cursor, leaving the cursor where it is.
// Pixels of up to 4 bytes are assembled in a 32-bit value, larger ones in a
// 64-bit value. A pixel size of 0 (unsupported format) writes nothing.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixelNoAdvance( int r, int g, int b, int a )
{
	Assert( IsUsingSupportedFormat() );
	Assert( !IsUsingFloatFormat() );

	if ( m_Size <= 0 )
	{
		return;
	}

	// Selects which of the two store layouts below is used.
	const bool bLowPartFirst = IsPC() || IsPS3() || !IsX360();

	if ( m_Size < 5 )
	{
		unsigned int nPacked = (r & m_RMask) << m_RShift;
		nPacked |= (g & m_GMask) << m_GShift;
		// blue is the one channel whose shift may be zero or negative (shift right)
		if ( m_BShift > 0 )
		{
			nPacked |= (b & m_BMask) << m_BShift;
		}
		else
		{
			nPacked |= (b & m_BMask) >> -m_BShift;
		}
		nPacked |= (a & m_AMask) << m_AShift;

		switch ( m_Size )
		{
		case 1:
			m_pBits[0] = (unsigned char)((nPacked & 0xff));
			return;

		case 2:
			((unsigned short *)m_pBits)[0] = (unsigned short)((nPacked & 0xffff));
			return;

		case 3:
			if ( bLowPartFirst )
			{
				((unsigned short *)m_pBits)[0] = (unsigned short)((nPacked & 0xffff));
				m_pBits[2] = (unsigned char)((nPacked >> 16) & 0xff);
			}
			else
			{
				m_pBits[0] = (unsigned char)(((nPacked >> 16) & 0xff));
				m_pBits[1] = (unsigned char)(((nPacked >> 8 ) & 0xff));
				m_pBits[2] = (unsigned char)(nPacked & 0xff);
			}
			return;

		case 4:
			((unsigned int *)m_pBits)[0] = nPacked;
			return;

		default:
			Assert( 0 );
			return;
		}
	}

	// RGBA32323232 or RGBA16161616 -- PC only.
	AssertMsgOnce(!IsX360(), "Unsupported lightmap format used in WritePixelNoAdvance(). This is a severe performance fault.\n");

	int64 nWide = ( ( int64 )(r & m_RMask) ) << m_RShift;
	nWide |= ( ( int64 )(g & m_GMask) ) << m_GShift;
	if ( m_BShift > 0 )
	{
		nWide |= ( ( int64 )(b & m_BMask) ) << m_BShift;
	}
	else
	{
		nWide |= ( ( int64 )(b & m_BMask) ) >> -m_BShift;
	}
	nWide |= ( ( int64 )(a & m_AMask) ) << m_AShift;

	switch ( m_Size )
	{
	case 6:
		if ( bLowPartFirst )
		{
			((unsigned int *)m_pBits)[0] = nWide & 0xffffffff;
			((unsigned short *)m_pBits)[2] = (unsigned short)( ( nWide >> 32 ) & 0xffff );
		}
		else
		{
			((unsigned int *)m_pBits)[0] = (nWide >> 16) & 0xffffffff;
			((unsigned short *)m_pBits)[2] = (unsigned short)( nWide & 0xffff );
		}
		return;

	case 8:
		if ( bLowPartFirst )
		{
			((unsigned int *)m_pBits)[0] = nWide & 0xffffffff;
			((unsigned int *)m_pBits)[1] = ( nWide >> 32 ) & 0xffffffff;
		}
		else
		{
			((unsigned int *)m_pBits)[0] = ( nWide >> 32 ) & 0xffffffff;
			((unsigned int *)m_pBits)[1] = nWide & 0xffffffff;
		}
		return;

	default:
		Assert( 0 );
		return;
	}
}


#ifdef _X360
// There isn't a PC port of these because of the many varied
// pixel formats the PC deals with. If you write SSE versions 
// of all the various necessary packers, then this can be made
// to work on PC.

//-----------------------------------------------------------------------------
// Writes one SIMD pixel, then steps the cursor to the next pixel
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixel( FLTX4 rgba ) RESTRICT
{
	WritePixelNoAdvance( rgba );
	m_pBits = m_pBits + m_Size;
}

//-----------------------------------------------------------------------------
// Writes a SIMD pixel at the cursor without moving the cursor.
// rgba are four float values, each on the range 0..255 (though they may leak
// fractionally over 255 due to numerical errors earlier).
// Dispatches on the pixel size and then on the stored format; combinations
// without a dedicated packer only raise an assert.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixelNoAdvance( FLTX4 rgba ) RESTRICT
{
	Assert( IsUsingSupportedFormat() );
	Assert( !IsUsingFloatFormat() );

	if ( m_Size == 0 )
	{
		return;
	}

	if ( m_Size == 4 )
	{
		AssertMsg((reinterpret_cast<unsigned int>(m_pBits) & 0x03) == 0,"Unaligned m_pBits in WritePixelNoAdvance!");
		switch ( m_Format )
		{
			// note: format names are low-order-byte first. 
		case IMAGE_FORMAT_RGBA8888:
		case IMAGE_FORMAT_LINEAR_RGBA8888:
			WritePixelNoAdvance_RGBA8888( rgba );
			break;

		case IMAGE_FORMAT_BGRA8888: // NOTE! : the low order bits are first in this naming convention.
		case IMAGE_FORMAT_LINEAR_BGRA8888:
			WritePixelNoAdvance_BGRA8888( rgba );
			break;

		default:
			AssertMsg1(false, "Unknown four-byte pixel format %d in lightmap write.\n", m_Format);
		}
		return;
	}

	if ( m_Size == 8 )
	{
		switch ( m_Format )
		{
			// note: format names are low-order-byte first. 
		case IMAGE_FORMAT_RGBA16161616:
		case IMAGE_FORMAT_LINEAR_RGBA16161616:
			WritePixelNoAdvance_RGBA16161616( rgba );
			break;

		default:
			AssertMsg1(false, "Unknown eight-byte pixel format %d in lightmap write.\n", m_Format);
		}
		return;
	}

	AssertMsg1(false, "WritePixelNoAdvance on unsupported 360 %d-byte format\n", m_Size);
}


// Explicit-format writer (lets callers avoid the format switch):
// converts the four floats of rgba to saturated bytes and stores the packed
// 32-bit pixel at m_pBits without moving the cursor.
FORCEINLINE void CPixelWriter::WritePixelNoAdvance_RGBA8888( FLTX4 rgba )
{
	// it's easier to do tiered convert-saturates here 
	// than  the d3d color convertor op

	// first permute: reverse the component order (3,2,1,0)
	const static fltx4 permReverse = XMVectorPermuteControl(3,2,1,0);
	fltx4 N = XMVectorPermute(rgba, rgba, permReverse);

	N = __vctuxs(N, 0); // convert to unsigned fixed point 0 w/ saturate
	N = __vpkuwus(N, N); // convert to halfword saturate
	N = __vpkuhus(N, N); // convert to byte saturate
	N = __vspltw(N, 0);  // splat word 0 (the packed pixel) to all four words

	__stvewx(N, m_pBits, 0); // store whatever word happens to be aligned with m_pBits to that word 
}

// Writes a BGRA8888 pixel at the current cursor by forwarding to the overload
// that takes an explicit destination pointer.
FORCEINLINE void CPixelWriter::WritePixelNoAdvance_BGRA8888( FLTX4 rgba )
{
	void *pDest = m_pBits;
	WritePixelNoAdvance_BGRA8888( rgba, pDest );
}

// Scales rgba per channel and stores it as four unsigned 16-bit values at the
// cursor, without moving the cursor.
FORCEINLINE void CPixelWriter::WritePixelNoAdvance_RGBA16161616( FLTX4 rgba )
{
	// input is in 0..16 range. 
	// Multiply by 4096 to get into 0..65536 range
	// NOTE(review): the fourth lane is scaled by 65536 instead, which presumably
	// means alpha arrives in 0..1 -- confirm against the callers.
	static const fltx4 vChannelScale = { 4096.0f, 4096.0f, 4096.0f, 65536.0f };
	const fltx4 vScaled = XMVectorMultiply( rgba, vChannelScale );

	XMUSHORT4 *pDest = (XMUSHORT4*)m_pBits;
	XMStoreUShort4( pDest, vScaled );
}

// Packs rgba into a single 32-bit pixel and stores it at pBits (which the
// caller passes in explicitly instead of having it read from m_pBits).
// The write cursor is neither read nor moved.
FORCEINLINE void CPixelWriter::WritePixelNoAdvance_BGRA8888( FLTX4 rgba, void * RESTRICT pBits ) RESTRICT
{
	// this happens to be in an order such that we can use the handy builtin packing op
	// clamp to 0..255 (coz it might have leaked over)
	static const fltx4 vTwoFiftyFive = {255.0f, 255.0f, 255.0f, 255.0f};
	fltx4 N = MinSIMD(vTwoFiftyFive, rgba); 

	// the magic number such that when mul-accumulated against rgba,
	// gets us a representation 3.0 + (r)*2^-22 -- puts the bits at
	// the bottom of the float
	static CONST XMVECTOR   PackScale = { (1.0f / (FLOAT)(1 << 22)), (1.0f / (FLOAT)(1 << 22)), (1.0f / (FLOAT)(1 << 22)), (1.0f / (FLOAT)(1 << 22))}; // 255.0f / (FLOAT)(1 << 22)
	static const XMVECTOR   Three = {3.0f, 3.0f, 3.0f, 3.0f};

	N = __vmaddfp(N, PackScale, Three);	// N * PackScale + Three
	N = __vpkd3d(N, N, VPACK_D3DCOLOR, VPACK_32, 3); // pack to X word
	N = __vspltw(N, 0); // splat X

	// this is a nasty thing to work around the April XDK bug in __stvewx
	// (the destination is copied into a fresh local before the store;
	// keep this form as-is)
	{
		void * RESTRICT copyOfPBits = pBits;
		__stvewx(N, copyOfPBits, 0);
	}

}

// For writing an entire, already-packed SIMD register (four pixels) at once.
//   rgba   - the 16 bytes to store
//   offset - byte offset added to the current cursor to form the destination
// The store is an aligned one, so the DESTINATION (cursor + offset), not just
// the cursor, has to be 16-byte aligned. The cursor itself is not moved.
FORCEINLINE void CPixelWriter::WriteFourPixelsExplicitLocation_BGRA8888 ( FLTX4 rgba, int offset )
{
	unsigned char *pDest = m_pBits + offset;
	Assert( (reinterpret_cast<unsigned int>(pDest) & 15) == 0 ); // assert alignment of the actual store address
	XMStoreVector4A( pDest, rgba );
}

#elif defined ( _PS3 )

// There isn't a PC port of these because of the many varied
// pixel formats the PC deals with. If you write SSE versions 
// of all the various necessary packers, then this can be made
// to work on PC.

//-----------------------------------------------------------------------------
// Writes one SIMD pixel, then steps the cursor to the next pixel
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixel( FLTX4 rgba ) RESTRICT
{
	WritePixelNoAdvance( rgba );
	m_pBits = m_pBits + m_Size;
}

//-----------------------------------------------------------------------------
// Writes a SIMD pixel at the cursor without moving the cursor.
// rgba are four float values, each on the range 0..255 (though they may leak
// fractionally over 255 due to numerical errors earlier).
// Only four-byte RGBA8888 / BGRA8888 destinations have a packer in this
// section; any other size or format only raises an assert.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::WritePixelNoAdvance( FLTX4 rgba ) RESTRICT
{
	Assert( IsUsingSupportedFormat() );
	Assert( !IsUsingFloatFormat() );

	switch (m_Size)
	{
	case 0:
		// unsupported format: nothing is written
		return;
	case 4:
		{
			AssertMsg((reinterpret_cast<unsigned int>(m_pBits) & 0x03) == 0,"Unaligned m_pBits in WritePixelNoAdvance!");
			switch ( m_Format )
			{
				// note: format names are low-order-byte first. 
			case IMAGE_FORMAT_RGBA8888:
			case IMAGE_FORMAT_LINEAR_RGBA8888:
				WritePixelNoAdvance_RGBA8888(rgba);
				break;

			case IMAGE_FORMAT_BGRA8888: // NOTE! : the low order bits are first in this naming convention.
			//EAPS3	case IMAGE_FORMAT_LINEAR_BGRA8888:
				WritePixelNoAdvance_BGRA8888(rgba);
				break;


			default:
				AssertMsg1(false, "Unknown four-byte pixel format %d in lightmap write.\n", m_Format);
			}
			break;
		}

	default:
		// This is the PS3 section of the header, so name the right platform.
		AssertMsg1(false, "WritePixelNoAdvance on unsupported PS3 %d-byte format\n", m_Size);
		break;
	}

}


// Explicit-format writer (lets callers avoid the format switch):
// converts the four floats of rgba to saturated bytes and stores the packed
// 32-bit pixel at m_pBits without moving the cursor.
FORCEINLINE void CPixelWriter::WritePixelNoAdvance_RGBA8888( FLTX4 rgba )
{
	// it's easier to do tiered convert-saturates here 
	// than  the d3d color convertor op

	// first permute (WZYX swizzle, i.e. reversed component order)

	fltx4 N = vec_perm(rgba, rgba, _VEC_SWIZZLE_WZYX);

	vector unsigned int   N_ui = vec_ctu(N, 0); // convert to unsigned fixed point 0 w/ saturate
	vector unsigned short N_us = vec_packsu(N_ui, N_ui); // convert to halfword saturate
	vector unsigned char  N_uc = vec_packsu(N_us, N_us); // convert to byte saturate
// don't need to do this, should already be unpacked to all elements in the same way
// (both pack steps are fed the same vector twice)
//	N = vec_splat((fltx4)N_uc, 0);  // splat w-word to all four
//	vec_ste(N, 0, m_pBits); // store whatever word happens to be aligned with m_pBits to that word 

	vec_ste((vec_uint4)N_uc, 0, (unsigned int *)m_pBits); // store whatever word happens to be aligned with m_pBits to that word 
}

// Writes a BGRA8888 pixel at the current cursor by forwarding to the overload
// that takes an explicit destination pointer.
FORCEINLINE void CPixelWriter::WritePixelNoAdvance_BGRA8888( FLTX4 rgba )
{
	void *pDest = m_pBits;
	WritePixelNoAdvance_BGRA8888( rgba, pDest );
}

// Converts the four floats of rgba to saturated bytes and stores the packed
// 32-bit pixel at pBits (passed in explicitly instead of being read from
// m_pBits). The write cursor is neither read nor moved.
// NOTE(review): unlike the RGBA8888 writer, no component permute is applied
// before packing -- confirm the resulting byte order is the intended one.
FORCEINLINE void CPixelWriter::WritePixelNoAdvance_BGRA8888( FLTX4 rgba, void * RESTRICT pBits ) RESTRICT
{
	vector unsigned int   N_ui = vec_ctu(rgba, 0); // convert to unsigned fixed point 0 w/ saturate
	vector unsigned short N_us = vec_packsu(N_ui, N_ui); // convert to halfword saturate
	vector unsigned char  N_uc = vec_packsu(N_us, N_us); // convert to byte saturate

	// No splat is needed before the store: both pack steps are fed the same
	// vector twice.
	vec_ste((vec_uint4)N_uc, 0, (unsigned int*)pBits); // store whatever word happens to be aligned with pBits to that word 
}

// For writing an entire, already-packed SIMD register (four pixels) at once.
// This version asserts that the cursor is 16-byte aligned AND that offset is
// zero. The cursor itself is not moved.
FORCEINLINE void CPixelWriter::WriteFourPixelsExplicitLocation_BGRA8888 ( FLTX4 rgba, int offset )
{
	Assert( (reinterpret_cast<unsigned int>(m_pBits) & 15) == 0 && offset == 0 ); // assert alignment

	float *pDest = (float*)m_pBits;
	vec_st( rgba, offset, pDest );
}

#endif

//-----------------------------------------------------------------------------
// Packs r, g, b, a exactly like WritePixelNoAdvance, but stores the result one
// byte at a time through a signed char pointer. The cursor is not moved.
// A pixel size of 0 (unsupported format) writes nothing.
//-----------------------------------------------------------------------------

FORCEINLINE_PIXEL void CPixelWriter::WritePixelNoAdvanceSigned( int r, int g, int b, int a )
{
	Assert( IsUsingSupportedFormat() );
	Assert( !IsUsingFloatFormat() );

	if ( m_Size <= 0 )
	{
		return;
	}

	// Selects which of the two byte orders below is used.
	const bool bLowByteFirst = IsPC() || IsPS3() || !IsX360();
	signed char *pOut = (signed char *)m_pBits;

	if ( m_Size < 5 )
	{
		int nPacked = (r & m_RMask) << m_RShift;
		nPacked |= (g & m_GMask) << m_GShift;
		// blue is the one channel whose shift may be zero or negative (shift right)
		if ( m_BShift > 0 )
		{
			nPacked |= (b & m_BMask) << m_BShift;
		}
		else
		{
			nPacked |= (b & m_BMask) >> -m_BShift;
		}
		nPacked |= (a & m_AMask) << m_AShift;

		if ( bLowByteFirst )
		{
			// each case deliberately falls into the next so that an N-byte
			// pixel writes bytes N-1 .. 0
			switch ( m_Size )
			{
			case 4:
				pOut[3] = (signed char)((nPacked >> 24) & 0xff);
				// fall through intentionally.
			case 3:
				pOut[2] = (signed char)((nPacked >> 16) & 0xff);
				// fall through intentionally.
			case 2:
				pOut[1] = (signed char)((nPacked >> 8) & 0xff);
				// fall through intentionally.
			case 1:
				pOut[0] = (signed char)((nPacked & 0xff));
				return;
			}
		}
		else
		{
			switch ( m_Size )
			{
			case 4:
				pOut[0] = (signed char)((nPacked >> 24) & 0xff);
				pOut[1] = (signed char)((nPacked >> 16) & 0xff);
				pOut[2] = (signed char)((nPacked >> 8) & 0xff);
				pOut[3] = (signed char)(nPacked & 0xff);
				break;
			case 3:
				pOut[0] = (signed char)((nPacked >> 16) & 0xff);
				pOut[1] = (signed char)((nPacked >> 8) & 0xff);
				pOut[2] = (signed char)(nPacked & 0xff);
				break;
			case 2:
				pOut[0] = (signed char)((nPacked >> 8) & 0xff);
				pOut[1] = (signed char)(nPacked & 0xff);
				break;
			case 1:
				pOut[0] = (signed char)(nPacked & 0xff);
				break;
			}
		}
		return;
	}

	// pixels wider than 4 bytes are assembled in 64 bits
	int64 nWide = ( ( int64 )(r & m_RMask) ) << m_RShift;
	nWide |= ( ( int64 )(g & m_GMask) ) << m_GShift;
	if ( m_BShift > 0 )
	{
		nWide |= ( ( int64 )(b & m_BMask) ) << m_BShift;
	}
	else
	{
		nWide |= ( ( int64 )(b & m_BMask) ) >> -m_BShift;
	}
	nWide |= ( ( int64 )(a & m_AMask) ) << m_AShift;

	if ( bLowByteFirst )
	{
		// same cascading scheme as above, extended to 6 and 8 bytes
		switch ( m_Size )
		{
		case 8:
			pOut[7] = (signed char)((nWide >> 56) & 0xff);
			pOut[6] = (signed char)((nWide >> 48) & 0xff);
			// fall through intentionally.
		case 6:
			pOut[5] = (signed char)((nWide >> 40) & 0xff);
			pOut[4] = (signed char)((nWide >> 32) & 0xff);
			// fall through intentionally.
		case 4:
			pOut[3] = (signed char)((nWide >> 24) & 0xff);
			// fall through intentionally.
		case 3:
			pOut[2] = (signed char)((nWide >> 16) & 0xff);
			// fall through intentionally.
		case 2:
			pOut[1] = (signed char)((nWide >> 8) & 0xff);
			// fall through intentionally.
		case 1:
			pOut[0] = (signed char)((nWide & 0xff));
			break;
		default:
			Assert( 0 );
			return;
		}
	}
	else
	{
		switch ( m_Size )
		{
		case 8:
			pOut[0] = (signed char)((nWide >> 56) & 0xff);
			pOut[1] = (signed char)((nWide >> 48) & 0xff);
			pOut[2] = (signed char)((nWide >> 40) & 0xff);
			pOut[3] = (signed char)((nWide >> 32) & 0xff);
			pOut[4] = (signed char)((nWide >> 24) & 0xff);
			pOut[5] = (signed char)((nWide >> 16) & 0xff);
			pOut[6] = (signed char)((nWide >> 8) & 0xff);
			pOut[7] = (signed char)(nWide & 0xff);
			break;
		case 6:
			pOut[0] = (signed char)((nWide >> 40) & 0xff);
			pOut[1] = (signed char)((nWide >> 32) & 0xff);
			pOut[2] = (signed char)((nWide >> 24) & 0xff);
			pOut[3] = (signed char)((nWide >> 16) & 0xff);
			pOut[4] = (signed char)((nWide >> 8) & 0xff);
			pOut[5] = (signed char)(nWide & 0xff);
			break;
		case 4:
			pOut[0] = (signed char)((nWide >> 24) & 0xff);
			pOut[1] = (signed char)((nWide >> 16) & 0xff);
			pOut[2] = (signed char)((nWide >> 8) & 0xff);
			pOut[3] = (signed char)(nWide & 0xff);
			break;
		case 3:
			pOut[0] = (signed char)((nWide >> 16) & 0xff);
			pOut[1] = (signed char)((nWide >> 8) & 0xff);
			pOut[2] = (signed char)(nWide & 0xff);
			break;
		case 2:
			pOut[0] = (signed char)((nWide >> 8) & 0xff);
			pOut[1] = (signed char)(nWide & 0xff);
			break;
		case 1:
			pOut[0] = (signed char)(nWide & 0xff);
			break;
		default:
			Assert( 0 );
			return;
		}
	}
}

//-----------------------------------------------------------------------------
// Reads the pixel under the cursor and splits it into its components; the
// cursor is not moved.
//   r, g, b, a - receive each channel, undone with the same shift and mask the
//                integer writers apply (so for formats whose masks keep only
//                the high bits of a component, e.g. the 16-bit ones, the
//                returned value has its low bits cleared).
// The pixel bytes are gathered into an unsigned 64-bit value so that formats
// of up to 8 bytes (channel shifts up to 48) can be decoded, and so that no
// signed shift can overflow.
//-----------------------------------------------------------------------------
FORCEINLINE_PIXEL void CPixelWriter::ReadPixelNoAdvance( int &r, int &g, int &b, int &a )
{
	Assert( IsUsingSupportedFormat() );
	Assert( !IsUsingFloatFormat() );

	// Never gather more than fits in the accumulator.
	const int nBytes = ( m_Size > 8 ) ? 8 : m_Size;

	// The first byte is always read, as before, even if the pixel size is 0.
	unsigned long long val = m_pBits[0];
	if ( IsPC() || IsPS3() || !IsX360() )
	{
		// byte i supplies bits 8*i .. 8*i+7
		for ( int i = 1; i < nBytes; ++i )
		{
			val |= ( (unsigned long long)m_pBits[i] ) << ( 8 * i );
		}
	}
	else
	{
		// first byte is the most significant one
		for ( int i = 1; i < nBytes; ++i )
		{
			val = ( val << 8 ) | m_pBits[i];
		}
	}

	r = (int)( ( val >> m_RShift ) & m_RMask );
	g = (int)( ( val >> m_GShift ) & m_GMask );
	// The writers shift blue RIGHT when m_BShift is zero or negative (the
	// 16-bit formats use -3 / -4), so undo that with a LEFT shift here rather
	// than shifting by a negative count.
	if ( m_BShift > 0 )
	{
		b = (int)( ( val >> m_BShift ) & m_BMask );
	}
	else
	{
		b = (int)( ( val << -m_BShift ) & m_BMask );
	}
	a = (int)( ( val >> m_AShift ) & m_AMask );
}

#endif // PIXELWRITER_H;