Counter-Strike: Global Offensive Source Code

  1. //========== Copyright © Valve Corporation, All rights reserved. ========
  2. // This is the central hub for controlling SPU activities relating to
  3. // RSX/graphics processing/rendering
  4. //
  5. #include "spugcm.h"
  6. #include "ps3/ps3gcmmemory.h"
  7. #include "fpcpatcher_spu.h"
  8. #include "ps3gcmstate.h"
  9. #include "vjobs/root.h"
  10. #include "ps3/ps3gcmlabels.h"
  11. #include "ps3/vjobutils_shared.h"
  12. #include "vjobs/jobparams_shared.h"
  13. #include "vjobs/ibmarkup_shared.h"
  14. #include "inputsystem/iinputsystem.h"
  15. #include <sysutil/sysutil_common.h>
  16. #include <sysutil/sysutil_sysparam.h>
  17. #include <cell/pad.h>
  18. #include <materialsystem/imaterialsystem.h>
  19. #include "fpcpatcher_spu.h"
  20. #include "dxabstract.h"
  21. #include "rsxflip.h"
  22. extern IVJobs * g_pVJobs;
  23. CSpuGcmSharedState g_spuGcmShared;
  24. CSpuGcm g_spuGcm;
  25. static int s_nFinishLabelValue = 0, s_nStopAtFinishLabelValue = -1;
  26. CEdgeGeomRing g_edgeGeomRing;
  27. ApplicationInstantCountersInfo_t g_aici;
  28. CEdgePostWorkload g_edgePostWorkload;
  29. #define PCB_RING_CTX ( *gCellGcmCurrentContext )
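// PCB_RING_CTX aliases the current libgcm context ( *gCellGcmCurrentContext ). OnGcmInit() below
// repoints that context's begin/current/end and its reserve callback, so command reservations are
// redirected through SpuGcmCommandBufferReserveCallback into the PPU command-buffer (PCB) ring
// rather than the default libgcm command buffer.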
  30. void FillNops( struct CellGcmContextData *context )
  31. {
  32. while( context->current < context->end )
  33. *( context->current++ ) = CELL_GCM_METHOD_NOP;
  34. }
  35. int32_t SpuGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nCount )
  36. {
  37. return g_spuGcm.OnGcmCommandBufferReserveCallback( context, nCount );
  38. }
  39. void SpuGcmDebugFinish( CellGcmContextData *thisContext )
  40. {
  41. Assert( thisContext == &PCB_RING_CTX );
  42. g_spuGcm.CmdBufferFinish();
  43. }
  44. void StallAndWarning( const char * pWarning )
  45. {
  46. sys_timer_usleep( 30 );
  47. if( g_spuGcmShared.m_enableStallWarnings )
  48. {
  49. Warning( "Stall: %s\n", pWarning );
  50. }
  51. }
  52. //#endif
  53. void CSpuGcm::CreateRsxBuffers()
  54. {
  55. //////////////////////////////////////////////////////////////////////////
  56. // Create Fragment program patch buffers
  57. //
  58. uint nFpcpRingBufferSize = CalculateMemorySizeFromCmdLineParam( "-fpcpRingSize", 512 * 1024, 32 * 1024 );
  59. Msg("Fpcp ring size: %d bytes \n", nFpcpRingBufferSize );
  60. m_fpcpRingBuffer.Alloc( kAllocPs3GcmShader, nFpcpRingBufferSize );
  61. g_spuGcmShared.m_fpcpRing.SetRsxBuffer( m_fpcpRingBuffer.DataInLocalMemory(), nFpcpRingBufferSize, nFpcpRingBufferSize / 4, nFpcpRingBufferSize / 4096 );
  62. uint nEdgeRingBufferSize = CalculateMemorySizeFromCmdLineParam( "-edgeRingSize", 2 * 1024 * 1024, 1536 * 1024 );
  63. Msg("Edge ring size: %d bytes\n", nEdgeRingBufferSize );
  64. m_edgeGeomRingBuffer.Alloc( kAllocPs3GcmEdgeGeomBuffer, nEdgeRingBufferSize );
  65. if( nEdgeRingBufferSize < 8 * EDGEGEOMRING_MAX_ALLOCATION )
  66. {
  67. Error( "EdgeGeom has ring buffer that won't fit 8 jobs, which is a minimum. %u ( %u ) < 8 * %u\n", nEdgeRingBufferSize, m_edgeGeomRingBuffer.Size(), EDGEGEOMRING_MAX_ALLOCATION );
  68. }
  69. if( nEdgeRingBufferSize < 6 * 8 * EDGEGEOMRING_MAX_ALLOCATION )
  70. {
  71. Warning( "EdgeGeom has ring buffer that may block job_edgegeom performance. %u ( %u ) < 6 SPUs * 8 segments * %u\n", nEdgeRingBufferSize, m_edgeGeomRingBuffer.Size(), EDGEGEOMRING_MAX_ALLOCATION );
  72. }
  73. }
  74. const vec_uint4 g_vuSpuGcmCookie = (vec_uint4){0x04291978,0xC00CC1EE,0x04291978,0xC00CC1EE};
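// Note: this cookie is written just past the MLAA output buffer in CreateIoBuffers() below;
// it presumably acts as a guard value for detecting overruns of that buffer.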
  75. void CSpuGcm::CreateIoBuffers()
  76. {
  77. const uint nCmdBufferOverfetchSlack = 1024;
  78. uint nFpRingIoBufferSize = 16 * 1024;
  79. uint nFpRingBufferSize = Max( nFpRingIoBufferSize, nCmdBufferOverfetchSlack ); // this buffer is RSX-write-only and sits at the end of mapped memory, so it also serves as overfetch slack and must therefore be at least the size of that slack
  80. g_spuGcmShared.m_fpcpRing.SetIoBuffer( g_ps3gcmGlobalState.IoMemoryPrealloc( nFpRingIoBufferSize, nFpRingBufferSize ), nFpRingIoBufferSize );
  81. m_pMlaaBufferCookie = NULL;
  82. m_pMlaaBuffer = NULL;
  83. m_pMlaaBufferOut = NULL;
  84. m_pEdgePostRsxLock = NULL;
  85. if( !CommandLine()->FindParm( "-noMlaa" ) )
  86. //if( CommandLine()->FindParm( "-edgeMlaa" ) )
  87. {
  88. uint nSizeofEdgePostBuffer = g_ps3gcmGlobalState.GetRenderSurfaceBytes( 128 );
  89. m_pMlaaBuffer = g_ps3gcmGlobalState.IoMemoryPrealloc( 128, nSizeofEdgePostBuffer + sizeof( g_vuSpuGcmCookie ) + sizeof( uint32 ) * CPs3gcmDisplay::SURFACE_COUNT );
  90. if( m_pMlaaBuffer )
  91. {
  92. m_pMlaaBufferOut = m_pMlaaBuffer;//( void* )( uintp( m_pMlaaBuffer ) + nSizeofEdgePostBuffer );
  93. m_pMlaaBufferCookie = ( vec_uint4* ) ( uintp( m_pMlaaBufferOut ) + nSizeofEdgePostBuffer );
  94. *m_pMlaaBufferCookie = g_vuSpuGcmCookie;
  95. m_pEdgePostRsxLock = ( uint32* )( m_pMlaaBufferCookie + 1 );
  96. }
  97. else
  98. {
  99. // if MlaaBuffer is NULL, it just means we're in the pass of computing the IO memory requirements
  100. }
  101. }
  102. }
  103. //
  104. // memory optimization: IO memory has slack, use it if it's big enough
  105. //
  106. void CSpuGcm::UseIoBufferSlack( uint nIoBufferSlack )
  107. {
  108. uint nSpuDrawQueueSize = CalculateMemorySizeFromCmdLineParam( "-spuDrawRingSize", 512 * 1024, 32 * 1024 );
  109. Msg( "SPU draw queue size: %d Kb\n" , nSpuDrawQueueSize / 1024 );
  110. uint nSpuDrawQueueDeferredSize = CalculateMemorySizeFromCmdLineParam( "-spuDrawDeferredRingSize", 210 * 1024, 32 * 1024 );
  111. Msg( "SPU draw deferred queue size: %d Kb\n" , nSpuDrawQueueDeferredSize / 1024 );
  112. m_nSpuDrawQueueSelector = 0;
  113. m_spuDrawQueues[0].Init( nSpuDrawQueueSize, &g_spuGcmShared.m_nSpuDrawGet[0], OnSpuDrawQueueFlush, OnSpuDrawQueueStall );
  114. m_spuDrawQueues[1].Init( nSpuDrawQueueDeferredSize, &g_spuGcmShared.m_nSpuDrawGet[1], OnSpuDrawQueueFlushDeferred, OnSpuDrawQueueStallDeferredDelegator );
  115. for( uint i = 0; i < ARRAYSIZE( m_pDeferredStates ); ++i )
  116. m_pDeferredStates[i] = ( DeferredState_t * ) g_ps3gcmGlobalState.IoSlackAlloc( 128, sizeof( DeferredState_t ) );
  117. for( uint i = 0; i < ARRAYSIZE( m_pDeferredQueueCursors ); ++i )
  118. m_pDeferredQueueCursors[i] = m_spuDrawQueues[1].GetCursor();
  119. m_pDeferredQueueSegment = m_pDeferredQueueCursors[0];
  120. m_pDeferredChunkSubmittedTill[1] = m_spuDrawQueues[1].GetCursor();
  121. for( uint i = 0; i < ARRAYSIZE( m_spuDrawQueues ); ++i )
  122. g_spuGcmShared.m_nSpuDrawGet[i] = m_spuDrawQueues[i].GetSignal();
  123. }
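// Note: m_spuDrawQueues[0] is the immediate ("normal") SPU draw queue and m_spuDrawQueues[1] is the
// deferred queue (used while the deferred/MLAA path is active); m_nSpuDrawQueueSelector chooses which
// one GetDrawQueue() returns, and the flush/stall callbacks passed to Init() above differ accordingly.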
  124. static fltx4 g_vertexProgramConstants[CELL_GCM_VTXPRG_MAX_CONST];
  125. // static uint s_nLastCtxBufferCookie = 0;
  126. // static uint s_nCtxBufferSegmentSubmitTime = 0; // divide by 2 and it'll be the weighted average of 79.8MHz ticks between segment submissions
  127. void CSpuGcm::OnGcmInit()
  128. {
  129. if( 127 & uintp( g_ps3gcmGlobalState.m_pLocalBaseAddress ) )
  130. {
  131. Error( "Local addresses map to main memory without proper 128-byte alignment! Some DMA assumptions are broken!!\n" );
  132. }
  133. if( 127 & uintp( g_ps3gcmGlobalState.m_nIoOffsetDelta ) )
  134. {
  135. Error( "IO addresses map to local memory without proper 128-byte alignment! Some DMA assumptions are broken!!\n" );
  136. }
  137. V_memset( &g_spuGcmShared.m_dxGcmState, 0, sizeof( g_spuGcmShared.m_dxGcmState ) );
  138. V_memset( &g_spuGcmShared.m_cachedRenderState, 0, sizeof( g_spuGcmShared.m_cachedRenderState ) );
  139. m_nPcbringWaitSpins = 0;
  140. m_pPcbringBuffer = NULL;
  141. m_eaLastJobThatUpdatesSharedState = 0;
  142. g_spuGcmShared.m_enableStallWarnings = ( CommandLine()->FindParm( "-enableStallWarnings" ) != 0 );
  143. g_spuGcmShared.m_edgeGeomFeeder.Init( m_edgeGeomRingBuffer.Size() );
  144. g_edgeGeomRing.Init( m_edgeGeomRingBuffer.DataInLocalMemory(), m_edgeGeomRingBuffer.Size(), g_ps3gcmGlobalState.m_nIoOffsetDelta, g_ps3gcmGlobalState.m_pLocalBaseAddress, GCM_LABEL_EDGEGEOMRING );
  145. g_spuGcmShared.m_eaEdgeGeomRing = &g_edgeGeomRing;
  146. g_spuGcmShared.m_fpcpRing.OnGcmInit( g_ps3gcmGlobalState.m_nIoOffsetDelta );
  147. g_spuGcmShared.m_nDrawLayerBits = g_spuGcmShared.LAYER_RENDER;
  148. g_spuGcmShared.m_nDrawLayerPredicates = g_spuGcmShared.LAYER_RENDER_AND_Z;
  149. g_spuGcmShared.m_nLastRsxInterruptValue = 0;
  150. if( m_pEdgePostRsxLock )
  151. {
  152. for( uint i = 0; i < CPs3gcmDisplay::SURFACE_COUNT; ++i )
  153. {
  154. m_pEdgePostRsxLock[i] = CELL_GCM_RETURN(); // assume previous flips already happened
  155. }
  156. }
  157. g_pVJobs->Register( this );
  158. m_zPass.Init();
  159. m_bUseDeferredDrawQueue = true;
  160. BeginGcmStateTransaction();
  161. g_pixelShaderPatcher.InitLocal( g_spuGcmShared.m_fpcpRing.GetRsxBuffer(), g_spuGcmShared.m_fpcpRing.GetRsxBufferSize() );
  162. g_spuGcmShared.m_eaFpcpSharedState = g_pixelShaderPatcher.m_state.m_pSharedState;
  163. g_spuGcmShared.m_nFpcpBufferMask = g_spuGcmShared.m_eaFpcpSharedState->m_nBufferMask;
  164. g_spuGcmShared.m_eaLocalBaseAddress = (uint32)g_ps3gcmGlobalState.m_pLocalBaseAddress;
  165. g_spuGcmShared.m_cachedRenderState.m_nDisabledSamplers = 0;
  166. g_spuGcmShared.m_cachedRenderState.m_nSetTransformBranchBits = 0;
  167. g_spuGcmShared.m_nDebuggerRunMask = SPUGCM_DEBUG_MODE ? 2 : 0;
  168. g_spuGcmShared.m_eaLastJobThatUpdatedMe = 0;
  169. g_spuGcmShared.m_nFpcPatchCounterOfLastSyncJob = g_pixelShaderPatcher.m_nFpcPatchCounterOfLastSyncJob;
  170. g_spuGcmShared.m_nFpcPatchCounter = g_pixelShaderPatcher.m_nFpcPatchCounter;
  171. g_spuGcmShared.m_nFpcpStartRangesAfterLastSync = g_spuGcmShared.m_eaFpcpSharedState->m_nStartRanges;
  172. g_spuGcmShared.m_eaZPassSavedState = NULL;
  173. g_spuGcmShared.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine = g_ps3gcmGlobalState.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine;
  174. g_spuGcmShared.m_eaPs3texFormats = g_ps3texFormats;
  175. g_spuGcmShared.m_eaVertexProgramConstants = g_vertexProgramConstants;
  176. m_nGcmFlushJobScratchSize = 0;
  177. m_nFrame = 0;
  178. // we shouldn't have used this format yet
  179. Assert( g_ps3texFormats[PS3_TEX_CANONICAL_FORMAT_COUNT].m_gcmPitchPer4X == 0 );
  180. Assert( g_ps3texFormats[PS3_TEX_CANONICAL_FORMAT_COUNT-1].m_gcmPitchPer4X != 0 );
  181. Assert( !( 0xF & uintp( g_spuGcmShared.m_eaPs3texFormats ) ) );
  182. Assert( g_spuGcmShared.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine );
  183. COMPILE_TIME_ASSERT( !GCM_CTX_UNSAFE_MODE );
  184. {
  185. m_pFinishLabel = cellGcmGetLabelAddress( GCM_LABEL_SPUGCM_FINISH );
  186. *m_pFinishLabel = s_nFinishLabelValue;
  187. uint nSysringBytes = g_ps3gcmGlobalState.m_nCmdSize - SYSTEM_CMD_BUFFER_RESERVED_AREA - 16 - sizeof( SysringWrapSequence::Tail_t ); // 16 bytes for the JTN to wrap the buffer around, and to be able to DMA it in 16-byte chunks
  188. nSysringBytes &= -16; // make it 16-byte aligned
  189. uint eaSysringBuffer = uintp( g_ps3gcmGlobalState.m_pIoAddress ) + SYSTEM_CMD_BUFFER_RESERVED_AREA;
  190. uint32 * pSysringBufferEnd = ( uint32* )( eaSysringBuffer + nSysringBytes );
  191. *pSysringBufferEnd = // this is not strictly needed...
  192. g_spuGcmShared.m_sysringWrap.m_tail.m_nJumpToBegin = CELL_GCM_JUMP( SYSTEM_CMD_BUFFER_RESERVED_AREA );
  193. V_memset( g_spuGcmShared.m_sysringWrap.m_tail.m_nNops, 0, sizeof( g_spuGcmShared.m_sysringWrap.m_tail.m_nNops ) );
  194. Assert( !( 0xF & uint( &g_spuGcmShared.m_sysringWrap ) ) );
  195. //COMPILE_TIME_ASSERT( SPUGCM_USE_SET_REFERENCE_FOR_SYSRING_SIGNAL );
  196. //g_spuGcmShared.m_pEaSysringEndLabel = ( uint32* ) cellGcmGetLabelAddress( GCM_LABEL_SYSRING_END );
  197. //*g_spuGcmShared.m_pEaSysringEndLabel = g_spuGcmShared.m_sysring.m_nEnd; // pretend we finished all processing
  198. //g_spuGcmShared.m_nSysringSegmentWords = ( g_ps3gcmGlobalState.m_nCmdSize - nSysringCmdBufferSystemArea ) / sizeof( uint32 ) / g_spuGcmShared.NUM_SYSTEM_SEGMENTS;
  199. //g_spuGcmShared.m_nSysringSegmentWords &= -16; // make it aligned, at least -4 words but may be more for easier debugging (more round numbers)
  200. g_spuGcmShared.m_nIoOffsetDelta = g_ps3gcmGlobalState.m_nIoOffsetDelta;
  201. g_spuGcmShared.m_nSysringWaitSpins = 0;
  202. g_spuGcmShared.m_nSysringPuts = 0;
  203. g_spuGcmShared.m_nSysringSegmentSizeLog2 = 29 - __cntlzw( g_ps3gcmGlobalState.m_nCmdSize ); // make 4 subsegments; guarantee segment switch whenever the ring wraps around
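// Illustration (example value assumed, not from the original): for a power-of-two command buffer,
// __cntlzw( 1 << k ) == 31 - k, so the line above yields k - 2, i.e. segments of 1/4 the buffer size;
// e.g. a 2 MB ( 1 << 21 ) command buffer gives m_nSysringSegmentSizeLog2 == 19 (512 KB segments).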
  204. // we need AT LEAST 2 segments and each segment must be AT LEAST 1kb - for performant and reliable operation;
  205. Assert( ( g_ps3gcmGlobalState.m_nCmdSize >> g_spuGcmShared.m_nSysringSegmentSizeLog2 ) > 2 && ( g_ps3gcmGlobalState.m_nCmdSize >> g_spuGcmShared.m_nSysringSegmentSizeLog2 ) < 8 && g_spuGcmShared.m_nSysringSegmentSizeLog2 >= 10 );
  206. //g_spuGcmShared.m_nSysringPut = 0;
  207. //g_spuGcmShared.m_nSysringEnd = g_spuGcmShared.NUM_SYSTEM_SEGMENTS; // pretend we got the whole buffer already
  208. g_spuGcmShared.m_nDebuggerBreakMask = 0x00000000;
  209. g_spuGcmShared.m_nDebugLastSeenGet = 0xFEFEFEFE;
  210. uint nPcbringSize = SPUGCM_DEFAULT_PCBRING_SIZE;
  211. COMPILE_TIME_ASSERT( !( SPUGCM_DEFAULT_PCBRING_SIZE & ( SPUGCM_DEFAULT_PCBRING_SIZE - 1 ) ) );
  212. g_spuGcmShared.m_nPcbringSize = nPcbringSize ;
  213. // 12 extra bytes are allocated for buffer alignment code to avoid writing past end of the buffer ; 4 more bytes are for the cookie
  214. //m_pPcbringBuffer = ( uint32 * )MemAlloc_AllocAligned( nPcbringSize + 12 + 4, 0x10 );
  215. //*AddBytes( m_pPcbringBuffer, g_spuGcmShared.m_nPcbringSize + 12 ) = 0x1234ABCD;
  216. m_nPcbringBegin = 0;
  217. g_spuGcmShared.m_nPcbringEnd = g_spuGcmShared.m_nPcbringSize; // consider the full ring buffer already processed on SPU and free: this End is the end of "free to use" area
  218. // this is the max count of words needed to align the cmd buffer and insert any write-labels/set-reference-values
  219. // we need to add at least 3 to the count, in case we align current pointer in the process ( because we may need to submit )
  220. // also, we want this segment size to fit inside the between-segment signal
  221. m_nMaxPcbringSegmentBytes = Min<uint>( ( ( nPcbringSize - 32 - SPUGCM_SIZEOF_SYSRING_ENDOFSEGMENT_SIGNAL_COMMAND ) / 4 ) & -16, ( 1 << g_spuGcmShared.m_nSysringSegmentSizeLog2 ) - SPUGCM_SIZEOF_SYSRING_ENDOFSEGMENT_SIGNAL_COMMAND - 12 ); //
  222. // we definitely need PCBring segment to fit well into local store
  223. m_nMaxPcbringSegmentBytes = Min<uint>( m_nMaxPcbringSegmentBytes, SPUGCM_LSRING_SIZE / 2 );
  224. m_nMaxPcbringSegmentBytes = Min<uint>( m_nMaxPcbringSegmentBytes, SPUGCM_MAX_PCBRING_SEGMENT_SIZE );
  225. m_nMaxPcbringSegmentBytes &= -16; // make it 16-byte aligned..
  226. cellGcmReserveMethodSize( gCellGcmCurrentContext, 3 ); // we need at most ( 2 words for reference command + ) 3 words for alignment
  227. // align the buffer on 16-byte boundary, because we manage it in 16-byte increments
  228. while( 0xF & uintp( gCellGcmCurrentContext->current ) )
  229. {
  230. *( gCellGcmCurrentContext->current++ ) = CELL_GCM_METHOD_NOP;
  231. }
  232. g_spuGcmShared.m_sysring.Init( eaSysringBuffer, nSysringBytes, uint( gCellGcmCurrentContext->current ) - eaSysringBuffer );
  233. g_spuGcmShared.m_sysringRo.Init( GCM_LABEL_SYSRING_SIGNAL );
  234. g_spuGcmShared.m_nSysringWrapCounter = 0;
  235. g_spuGcmShared.m_eaGcmControlRegister = cellGcmGetControlRegister();
  236. g_spuGcmShared.m_eaSysringLabel = cellGcmGetLabelAddress( GCM_LABEL_SYSRING_SIGNAL );
  237. g_spuGcmShared.m_eaDebugLabel[0] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG0 );
  238. g_spuGcmShared.m_eaDebugLabel[1] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG1 );
  239. g_spuGcmShared.m_eaDebugLabel[2] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG2 );
  240. *g_spuGcmShared.m_eaSysringLabel = g_spuGcmShared.m_sysring.GetSignal(); // pretend we executed WriteLabel
  241. g_spuGcmShared.m_nLastSignal = g_spuGcmShared.m_sysring.GetInvalidSignal();
  242. #if SPU_GCM_DEBUG_TRACE
  243. g_spuGcmShared.m_nDebugTraceBufferNext = 0;
  244. g_spuGcmShared.m_eaDebugTraceBuffer = ( SpuGcmDebugTrace_t* )MemAlloc_AllocAligned( g_spuGcmShared.DEBUG_BUFFER_COUNT * sizeof( SpuGcmDebugTrace_t ), 16 );
  245. #endif
  246. if( SPUGCM_USE_SET_REFERENCE_FOR_SYSRING_SIGNAL )
  247. {
  248. g_spuGcmShared.m_eaGcmControlRegister->ref = g_spuGcmShared.m_sysring.m_nEnd;// pretend we finished all processing
  249. }
  250. #ifdef _DEBUG
  251. m_nJobsPushed = 0;
  252. // fill in JTS in the rest of the buffer
  253. for( uint32 * pSlack = gCellGcmCurrentContext->current; pSlack < pSysringBufferEnd; ++pSlack )
  254. *pSlack = CELL_GCM_JUMP( uintp( pSlack ) - uintp( g_ps3gcmGlobalState.m_pIoAddress ) );
  255. #endif
  256. // set reference BEFORE we switch to sysring
  257. uint nGcmPut = uintp( gCellGcmCurrentContext->current ) + g_spuGcmShared.m_nIoOffsetDelta;
  258. Assert( !( 0xF & nGcmPut ) );
  259. __sync();
  260. g_spuGcmShared.m_eaGcmControlRegister->put = nGcmPut;
  261. // wait for RSX to reach this point, then switch to the new command buffer scheme
  262. int nAttempts = 0;
  263. while( g_spuGcmShared.m_eaGcmControlRegister->get != nGcmPut )
  264. {
  265. sys_timer_usleep(1000);
  266. if( ++nAttempts > 1000 )
  267. {
  268. Warning( "Cannot properly wait for RSX in OnGcmInit(%X!=%X); assuming everything's all right anyway.\n", g_spuGcmShared.m_eaGcmControlRegister->get, nGcmPut );
  269. break; // don't wait forever..
  270. }
  271. }
  272. //////////////////////////////////////////////////////////////////////////
  273. // Switch to PPU Command Buffer RING
  274. //
  275. // set reference BEFORE we switch to sysring; wait for all RSX initialization to go through before switching
  276. PCB_RING_CTX.begin = PCB_RING_CTX.current = NULL;//m_pPcbringBuffer;
  277. // we need to at least double-buffer to avoid deadlocks while waiting to submit a Pcbring segment
  278. // Each segment ends with a reference value update, and we need that update to unblock a piece of memory for use by subsequent submits
  279. Assert( GetMaxPcbringSegmentBytes() <= nPcbringSize / 2 );
  280. PCB_RING_CTX.end = NULL;//AddBytes( m_pPcbringBuffer, GetMaxPcbringSegmentBytes() );
  281. PCB_RING_CTX.callback = SpuGcmCommandBufferReserveCallback;
  282. #ifdef CELL_GCM_DEBUG // [
  283. gCellGcmDebugCallback = SpuGcmDebugFinish;
  284. cellGcmDebugCheckEnable( CELL_GCM_TRUE );
  285. #endif // ]
  286. }
  287. }
  288. inline signed int CSpuGcm::GetPcbringAvailableBytes()const
  289. {
  290. int nReallyAvailable = int32( *(volatile uint32*)&g_spuGcmShared.m_nPcbringEnd ) - int32( m_nPcbringBegin );
  291. #ifdef DBGFLAG_ASSERT
  292. Assert( uint( nReallyAvailable ) <= g_spuGcmShared.m_nPcbringSize );
  293. static int s_nLastPcbringAvailableBytes = -1;
  294. s_nLastPcbringAvailableBytes = nReallyAvailable;
  295. #endif
  296. Assert( nReallyAvailable >= 0 );
  297. return nReallyAvailable;
  298. }
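// Note: m_nPcbringEnd appears to be advanced by the SPU side as submitted segments are consumed,
// while m_nPcbringBegin tracks what the PPU has written, so End - Begin is the number of bytes
// currently free to write into the ring; the asserts above bound it to [0, ring size].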
  299. int CSpuGcm::OnGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nReserveCount )
  300. {
  301. FillNops(context);
  302. // IMPORTANT: we only allocate the necessary number of words here, no more no less
  303. // if we over-allocate, we may end up reordering commands in SPU draw queue following after GCM_FUNC commands
  304. uint nReserve = nReserveCount;
  305. uint32 * pDrawQueueCommand = GetDrawQueue()->AllocWords( nReserve + 1 );
  306. *pDrawQueueCommand = SPUDRAWQUEUE_GCMCOMMANDS_METHOD | nReserve;
  307. context->begin = context->current = pDrawQueueCommand + 1;
  308. context->end = context->begin + nReserve;
  309. if( IsDebug() )
  310. V_memset( context->current, 0xFE, nReserve * 4 );
  311. return CELL_OK;
  312. }
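// In effect, libgcm reserve requests are not sent to RSX directly here: each reservation is wrapped
// in a SPUDRAWQUEUE_GCMCOMMANDS_METHOD chunk of the SPU draw queue, which the SPU side presumably
// replays into the real command buffer later. The 0xFE fill in debug builds marks unwritten words.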
  313. void CSpuGcm::BeginGcmStateTransaction()
  314. {
  315. m_nCurrentBatch = BATCH_GCMSTATE;
  316. SetCurrentBatchCursor( GetDrawQueue()->GetCursor() );
  317. }
  318. void CSpuGcm::PushStateFlushJob( SpuDrawQueue * pDrawQueue, uint nResultantSpuDrawQueueSignal, uint32 *pCursorBegin, uint32 * pCursorEnd )
  319. {
  320. // only submit the job if there are any commands in the state command buffer
  321. CellSpursJob128 * pJob = m_jobPool128.Alloc( *m_pRoot->m_pGcmStateFlush );
  322. job_gcmstateflush::JobParams_t * pJobParams = job_gcmstateflush::GetJobParams( pJob );
  323. pJob->header.useInOutBuffer = 1;
  324. CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );
  325. dmaConstructor.AddInputDma( sizeof( g_spuGcmShared ), &g_spuGcmShared ); // dma[0]; must be the first to be 128-byte aligned for atomics
  326. uint nSizeofDrawQueueUploadBytes = pDrawQueue->Collect( pCursorBegin, pCursorEnd, dmaConstructor );
  327. Assert( !( nSizeofDrawQueueUploadBytes & 3 ) );
  328. dmaConstructor.AddSizeInOrInOut( 48 + SPUGCM_LSRING_SIZE ); // 16 bytes for alignment; 16 for lsZero; 16 for lsTemp;
  329. COMPILE_TIME_ASSERT( sizeof( CPs3gcmTextureLayout::Format_t ) == 16 );
  330. dmaConstructor.AddCacheDma( g_nPs3texFormatCount * sizeof( CPs3gcmTextureLayout::Format_t ), g_ps3texFormats ) ;
  331. dmaConstructor.FinishIoBuffer( &pJob->header, pJobParams );
  332. pJobParams->m_nSkipDrawQueueWords = ( uintp( pCursorBegin ) / sizeof( uint32 ) ) & 3;
  333. pJobParams->m_nSizeofDrawQueueUploadWords = nSizeofDrawQueueUploadBytes / sizeof( uint32 ) ;
  334. Assert( uint( pJobParams->m_nSizeofDrawQueueUploadWords ) == nSizeofDrawQueueUploadBytes / sizeof( uint32 ) ); // make sure it fits into uint16
  335. pJobParams->m_nSpuDrawQueueSignal = nResultantSpuDrawQueueSignal;
  336. #ifdef DBGFLAG_ASSERT
  337. SpuDrawQueue * pSignalDrawQueue = &m_spuDrawQueues[ nResultantSpuDrawQueueSignal & 3 ? 1 : 0 ]; (void)pSignalDrawQueue;
  338. Assert( pSignalDrawQueue->IsValidCursor( (uint32*)( nResultantSpuDrawQueueSignal & ~3 ) ) );
  339. #endif
  340. uint nResultantSpuDrawQueueIndex = nResultantSpuDrawQueueSignal & 3;
  341. m_pDeferredChunkSubmittedTill[ nResultantSpuDrawQueueIndex ] = ( uint32* )( nResultantSpuDrawQueueSignal & ~3 );
  342. Assert( CELL_OK == cellSpursCheckJob( (const CellSpursJob256 *)pJob, sizeof( *pJob ), 256 ) );
  343. m_eaLastJobThatUpdatesSharedState = ( uintp )pJob;
  344. pJob->header.sizeScratch = m_nGcmFlushJobScratchSize;
  345. m_nGcmFlushJobScratchSize = 0;
  346. PushSpuGcmJob( pJob );
  347. if( SPUGCM_DEBUG_MODE )
  348. {
  349. // in SPUGCM_DEBUG_MODE, we execute all jobs and wait for them to complete. So, the GET pointer should always trail our pNext pointer
  350. Assert( g_spuGcmShared.m_nSpuDrawGet[nResultantSpuDrawQueueIndex] == ( nResultantSpuDrawQueueSignal & ~3 ) );
  351. }
  352. }
  353. void CSpuGcm::GcmStateFlush( )
  354. {
  355. Assert( m_nCurrentBatch == BATCH_GCMSTATE );
  356. if( IsDeferredDrawQueue() )
  357. {
  358. Warning( "Unexpected Flush in deferred spu draw queue\n" );
  359. OpenDeferredChunk();
  360. }
  361. else
  362. {
  363. if( GetCurrentBatchCursor() != GetDrawQueue()->GetCursor() )
  364. {
  365. FillNops( &PCB_RING_CTX );
  366. Assert( GetDrawQueue() == &m_spuDrawQueues[0] );
  367. PushStateFlushJob( &m_spuDrawQueues[0], m_spuDrawQueues[0].GetSignal(), GetCurrentBatchCursor(), GetDrawQueue()->GetCursor() );
  368. BeginGcmStateTransaction();
  369. ZPassCheckpoint( 6 );
  370. }
  371. }
  372. }
  373. void CSpuGcm::PushSpuGcmJob( CellSpursJob128 * pJob )
  374. {
  375. #ifdef _DEBUG
  376. m_nJobsPushed++;
  377. #endif
  378. PushSpuGcmJobCommand( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
  379. if( SPUGCM_DEBUG_MODE )
  380. {
  381. if( !m_zPass )
  382. {
  383. // in ZPass_Z the job doesn't free its descriptor
  384. // in ZPass_Render, we don't start the jobs through here
  385. // so we can't use this spin-wait to wait for the job to complete
  386. while( *( volatile uint64* )&pJob->header.eaBinary )
  387. {
  388. sys_timer_usleep( 60 );
  389. }
  390. }
  391. while( g_spuGcmShared.m_eaLastJobThatUpdatedMe != uintp( pJob ) )
  392. {
  393. sys_timer_usleep( 60 );
  394. }
  395. }
  396. }
  397. void CSpuGcm::PushSpuGcmJobCommand( uint64 nCommand )
  398. {
  399. if( m_zPass )
  400. {
  401. m_zPass.PushCommand( nCommand );
  402. }
  403. else
  404. {
  405. m_jobSink.PushSyncJobSync( nCommand );
  406. }
  407. }
  408. void CSpuGcm::ZPassCheckpoint( uint nReserveSlots )
  409. {
  410. if( m_zPass )
  411. {
  412. uint nFreeSubchainSlots = m_zPass.GetSubchainCapacity();
  413. if( nFreeSubchainSlots < 2 * nReserveSlots )
  414. {
  415. ExecuteOnce( Warning("Aborting Z prepass: not enough room for commands in zpass sub-job-chain (%d left).\n", nFreeSubchainSlots ) );
  416. AbortZPass(); // initiate Abort sequence of ZPass; reentrant
  417. }
  418. uint nFreeJobDescriptors = m_jobPool128.GetReserve( m_zPass.m_nJobPoolMarker );
  419. if( nFreeJobDescriptors < nReserveSlots )
  420. {
  421. ExecuteOnce( Warning("Aborting Z prepass: not enough room for job descriptors in m_jobPool128 (%d left)\n", nFreeJobDescriptors ) );
  422. AbortZPass();
  423. }
  424. }
  425. }
  426. void CSpuGcm::OnSetPixelShaderConstant()
  427. {
  428. Assert( !IsDeferredDrawQueue() );
  429. if( m_zPass )
  430. {
  431. if( !m_zPass.m_isInEndZPass )
  432. {
  433. if( g_pixelShaderPatcher.GetJournalSpaceLeftSince( m_zPass.m_nFpcpStateEndOfJournalIdxAtZPassBegin ) < 512 )
  434. {
  435. ExecuteOnce( Warning( "Performance Warning: Too many pixel shader constants set inside ZPass; aborting ZPass\n" ) );
  436. AbortZPass();
  437. }
  438. }
  439. }
  440. else
  441. {
  442. // we have space for 48kB (3k of constants) in FPCP;
  443. // every SetPixelShaderConstant may add 97 constants (96 values, 1 header)
  444. if( g_pixelShaderPatcher.GetJournalSpaceUsedSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) > ( 32*1024 / 16 ) || g_pixelShaderPatcher.GetJournalSpaceLeftSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) < 512 )
  445. {
  446. ExecuteOnce( Warning("Performance Warning: SetPixelShaderConstantF called for %d constants, but no draw calls were issued. Flushing FPCP state.\n", g_pixelShaderPatcher.GetJournalSpaceUsedSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) ) );
  447. // flush GCM with only one purpose: make it flush the patcher
  448. GetDrawQueue()->Push2( SPUDRAWQUEUE_FLUSH_FPCP_JOURNAL, g_pixelShaderPatcher.GetStateEndOfJournalIdx() );
  449. GcmStateFlush();
  450. }
  451. }
  452. }
  453. void CSpuGcm::OnSpuDrawQueueStallDeferredDelegator( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords )
  454. {
  455. g_spuGcm.OnSpuDrawQueueStallDeferred( pDrawQueue, pGet, nWords );
  456. }
  457. void CSpuGcm::OnSpuDrawQueueStallDeferred( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords )
  458. {
  459. // we need to try to wait for the previous deferred batch to finish
  460. // in any case we should be prepared for "out of space" condition
  461. // in which case we'll just execute all deferred commands right now
  462. if( pGet == m_pDeferredChunkSubmittedTill[1] )
  463. {
  464. // we have nothing else to wait for, we need to free the space by executing deferred commands now
  465. // full flush (this frame only, since the previous frame was flushed the first time we called DrawQueueDeferred())
  466. FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
  467. // the only deferred chunk that can resize is GCMFLUSH
  468. // and handling it is pretty easy: we can either execute whatever it collected so far
  469. if( m_pDeferredChunkHead )
  470. {
  471. // sanity check: we shouldn't have chunks as big as 64KB
  472. Assert( m_spuDrawQueues[1].Length( m_pCurrentBatchCursor[1], m_pDeferredChunkHead ) <= 64*1024 );
  473. Assert( *m_pDeferredChunkHead == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD && m_pDeferredChunkHead == m_pDeferredQueueCursors[0] );
  474. }
  475. // temporarily switch to normal queue state in order to replay the deferred queue commands and purge them
  476. uint32 * pDeferredQueueSegment = m_pDeferredQueueSegment;
  477. m_nSpuDrawQueueSelector = 0;
  478. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
  479. BeginGcmStateTransaction(); // this transaction is beginning in Normal draw queue; Deferred queue is currently in "frozen" state (almost out of memory)
  480. g_flipHandler.QmsAdviceBeforeDrawPrevFramebuffer();
  481. // flush previous frame first, and if it doesn't change Get , flush this frame
  482. ExecuteDeferredDrawQueue( 1 );
  483. extern void DxDeviceForceUpdateRenderTarget( );
  484. DxDeviceForceUpdateRenderTarget( ); // recover main render target, as it was screwed up by execution of previous frame's commands
  485. ExecuteDeferredDrawQueue( 0 );
  486. m_nFramesToDisableDeferredQueue = 1;
  487. // return to the deferred state after purging the queue. During purging the deferred queue, DrawQueue(Normal|Deferred) could not have been called
  488. // this "unfreezes" the deferred queue, which should by now be almost-all-free( or pending, depending on how fast SPUs will chew through it)
  489. Assert( m_pDeferredQueueSegment == pDeferredQueueSegment );
  490. // we executed up to this point (last opened chunk), we discard everything before it.
  491. // the last opened chunk is perfectly fine to begin the queue segment, so we pretend we began deferred queue there
  492. m_pDeferredQueueSegment = m_pDeferredQueueCursors[0];
  493. m_nSpuDrawQueueSelector = 1;
  494. }
  495. }
  496. void CSpuGcm::OnSpuDrawQueueFlushDeferred( SpuDrawQueue *pDrawQueue )
  497. {
  498. // break up long GCM chunks
  499. Assert( pDrawQueue == g_spuGcm.GetDrawQueue() );
  500. Assert( !g_spuGcm.m_pDeferredChunkHead || ( *g_spuGcm.m_pDeferredChunkHead & ~SPUDRAWQUEUE_DEFERRED_GCMFLUSH_MASK ) == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD ); // this is the only chunk we allocate incrementally
  501. // prevent this from being called recursively: reset flush watermark before doing anything else
  502. pDrawQueue->SetFlushWatermarkFrom( pDrawQueue->GetCursor() );
  503. g_spuGcm.OpenDeferredChunk();
  504. }
  505. void CSpuGcm::OnSpuDrawQueueStall( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint32 nWords )
  506. {
  507. Assert( pDrawQueue == &g_spuGcm.m_spuDrawQueues[0] );
  508. StallAndWarning( "SpuDrawQueue stall: PPU is waiting for SPU, and SPU is probably waiting for RSX\n"/*, nWords, pGet, g_spuGcm.m_spuDrawQueues[0].GetCursor()*/ );
  509. }
  510. void CSpuGcm::OnSpuDrawQueueFlush( SpuDrawQueue *pDrawQueue )
  511. {
  512. // currently, there's only one such queue and it's the current draw queue
  513. Assert( pDrawQueue == g_spuGcm.GetDrawQueue() );
  514. g_spuGcm.GcmStateFlush();
  515. }
  516. void CSpuGcm::OnSpuDrawQueueFlushInZPass()
  517. {
  518. //
  519. // flush watermark has changed now (it changes on every collect())
  520. // override flush watermark to flush before we reach ZPass cursor,
  521. // and if it's impossible, then Abort ZPass - we don't have enough space
  522. // in SPU GCM buffer
  523. //
  524. // Take care not to flush excessively when pushing the last few commands into
  525. // SPUGCM draw buffer because we can be doing that right around flush watermark
  526. // frequently
  527. //
  528. uint32 * pOldFlushWatermark = GetDrawQueue()->GetFlushWatermark();
  529. GcmStateFlush();
  530. uint32 * pNewFlushWatermark = GetDrawQueue()->GetFlushWatermark();
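// The test below checks whether m_zPass.m_pCursor falls inside the (possibly wrapped-around)
// interval between the old and new flush watermarks; if it does, the next watermark flush would
// land past the ZPass replay cursor, so the ZPass has to be aborted here.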
  531. if( pNewFlushWatermark < pOldFlushWatermark ? pNewFlushWatermark >= m_zPass.m_pCursor || pOldFlushWatermark <= m_zPass.m_pCursor : pOldFlushWatermark <= m_zPass.m_pCursor && m_zPass.m_pCursor <= pNewFlushWatermark )
  532. {
  533. // the next flush will be too late;
  534. // NOTE: we can recover up to 32KB by adjusting the flush watermark here, but I have bigger fish to fry, so we'll just abort ZPass right now and here
  535. AbortZPass();
  536. }
  537. }
  538. void CSpuGcm::OnSpuDrawQueueFlushInZPass( SpuDrawQueue *pDrawQueue )
  539. {
  540. // TODO: check if cursor is intersected and potentially EndZPass()
  541. Assert( pDrawQueue == g_spuGcm.GetDrawQueue() );
  542. g_spuGcm.OnSpuDrawQueueFlushInZPass();
  543. }
  544. void SpuGcmCommandBufferFlush()
  545. {
  546. g_spuGcm.CmdBufferFlush();
  547. }
  548. SpuDrawHeader_t * CSpuGcm::BeginDrawBatch()
  549. {
  550. SpuDrawHeader_t * pDrawHeader;
  551. if( IsDeferredDrawQueue() )
  552. {
  553. uintp eaSpuDrawHeader = ( uintp ) OpenDeferredChunk( SPUDRAWQUEUE_DEFERRED_DRAW_METHOD, 3 + ( sizeof( SpuDrawHeader_t ) + sizeof( IDirect3DVertexDeclaration9 * /*pVertDecl*/ ) ) / sizeof( uint32 ) );
  554. pDrawHeader = ( SpuDrawHeader_t * ) AlignValue( eaSpuDrawHeader, 16 );
  555. }
  556. else
  557. {
  558. GcmStateFlush();
  559. // we must be in the default batch transaction, and it must be empty so that we can switch the transaction type
  560. Assert( m_nCurrentBatch == BATCH_GCMSTATE && GetCurrentBatchCursor() == GetDrawQueue()->GetCursor() );
  561. pDrawHeader = GetDrawQueue()->AllocAligned<SpuDrawHeader_t>();
  562. }
  563. m_nCurrentBatch = BATCH_DRAW;
  564. Assert( GetDrawQueue()->IsValidCursor( (uint32*)( pDrawHeader + 1 ) ) );
  565. SetCurrentBatchCursor( ( uint32* ) pDrawHeader );
  566. return pDrawHeader;
  567. }
  568. CellSpursJob128 * CSpuGcm::PushDrawBatchJob( uint nResultantSpuDrawQueueSignal, SpuDrawHeader_t * pDrawHeader, IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup )
  569. {
  570. CellSpursJob128 * pJob = m_jobPool128.Alloc( *m_pRoot->m_pDrawIndexedPrimitive );
  571. pJob->header.useInOutBuffer = 1;
  572. // we'll DMA get textures and layouts inside the job; we'll need space for DMA elements to do so
  573. pJob->header.sizeScratch = AlignValue( sizeof( JobDrawIndexedPrimitiveScratch_t ), 128 ) / 16;
  574. CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );
  575. dmaConstructor.AddInputDma( sizeof( g_spuGcmShared ), &g_spuGcmShared ); // dma[0]; must be the first to be 128-byte aligned for atomics
  576. dmaConstructor.AddInputDma( sizeof( *pVertDecl ), pVertDecl ); // dma[1]
  577. dmaConstructor.AddInputDma( sizeof( *pDrawHeader ), pDrawHeader ); // dma[2]
  578. COMPILE_TIME_ASSERT( sizeof( g_spuGcmShared ) < 16 * 1024 && sizeof( *pVertDecl ) < 16 * 1024 && sizeof( *pDrawHeader ) < 16 * 1024 );
  579. // pIbMarkup = pDrawHeader->m_eaIbMarkup;
  580. if ( pIbMarkup )
  581. {
  582. uint nIbMarkupBytes = ( pIbMarkup->m_numPartitions * sizeof( OptimizedModel::OptimizedIndexBufferMarkupPs3_t::Partition_t ) + sizeof( OptimizedModel::OptimizedIndexBufferMarkupPs3_t ) );
  583. dmaConstructor.AddInputDma( ( nIbMarkupBytes + 31 ) & -16, ( const void* )( uintp( pIbMarkup ) & -16 ) ); // dma[3]
  584. }
  585. //dmaConstructor.AddInputDmaLarge( SPUGCM_LSRING_SIZE, nUsefulBytesAligned, PCB_RING_CTX.begin ); // dma[4,5,6,7]
  586. dmaConstructor.AddSizeInOrInOut( SPUGCM_LSRING_SIZE );
  587. COMPILE_TIME_ASSERT( SPUGCM_LSRING_SIZE / (16*1024) <= 4 );
  588. // usage of the IO buffer slack:
  589. // alignment, sync signal, wrap sequence, alignment, RSX PUT control register output, SPURS job command output
  590. dmaConstructor.AddSizeInOrInOut(
  591. 128 // potential misalignment of command buffer, for double-bandwidth DMA to command buffer (not used now)
  592. + sizeof( SysringWrapSequence ) // is it accounted for in the LSRING_SLACK?
  593. + 16 // lsResetDrawBatch
  594. + 16 // lsTempRsxPut
  595. + 16 // g_lsDummyRead
  596. );
  597. COMPILE_TIME_ASSERT( sizeof( CPs3gcmTextureLayout::Format_t ) == 16 );
  598. dmaConstructor.AddCacheDma( g_nPs3texFormatCount * sizeof( CPs3gcmTextureLayout::Format_t ), g_ps3texFormats ) ; // dma[8]
  599. dmaConstructor.FinishIoBuffer( &pJob->header );
  600. pJob->header.sizeStack = 16 * 1024 / 16;
  601. pDrawHeader->m_nPs3texFormatCount = g_nPs3texFormatCount; // for reference; is not strictly needed here
  602. pDrawHeader->m_nUsefulCmdBytes = 0;//nUsefulBytes;
  603. pDrawHeader->m_nPcbringBegin = 0;//m_nPcbringBegin; // note: this is the post-updated buffer counter!
  604. pDrawHeader->m_nResultantSpuDrawGet = nResultantSpuDrawQueueSignal;
  605. #ifdef DBGFLAG_ASSERT
  606. SpuDrawQueue * pSignalDrawQueue = &m_spuDrawQueues[ nResultantSpuDrawQueueSignal & 3 ? 1 : 0 ];(void)pSignalDrawQueue;
  607. Assert( pSignalDrawQueue->IsValidCursor( (uint32*)( nResultantSpuDrawQueueSignal & ~3 ) ) );
  608. #endif
  609. uint nResultantSpuDrawQueueIndex = nResultantSpuDrawQueueSignal & 3;
  610. m_pDeferredChunkSubmittedTill[ nResultantSpuDrawQueueIndex ] = ( uint32* )( nResultantSpuDrawQueueSignal & ~3 );
  611. Assert( CELL_OK == cellSpursCheckJob( (const CellSpursJob256 *)pJob, sizeof( *pJob ), 256 ) );
  612. m_eaLastJobThatUpdatesSharedState = ( uintp )pJob;
  613. //PCB_RING_CTX.begin = PCB_RING_CTX.current = pSkipTo; // submitted; now when needed, we'll wait for SPU to reply through shared state
  614. //Assert( PCB_RING_CTX.begin <= PCB_RING_CTX.end );
  615. PushSpuGcmJob( pJob );
  616. // after this job runs, it spawns FPCP job, which will advance the FPCP state
  617. m_nFpcpStateEndOfJournalIdxAtSpuGcmJob = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
  618. if( SPUGCM_DEBUG_MODE )
  619. {
  620. // in SPUGCM_DEBUG_MODE, we execute all jobs and wait for them to complete. So, the GET pointer should always trail our pNext pointer
  621. Assert( g_spuGcmShared.m_nSpuDrawGet[ nResultantSpuDrawQueueIndex ] == ( nResultantSpuDrawQueueSignal & ~3 ) );
  622. }
  623. return pJob;
  624. }
  625. // BUG: pVertDecl may be released right after this call, we need to copy it somewhere or addref
  626. void CSpuGcm::SubmitDrawBatch( IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup )
  627. {
  628. Assert( m_nCurrentBatch == BATCH_DRAW );
  629. SpuDrawHeader_t * pDrawHeader = ( SpuDrawHeader_t * )GetCurrentBatchCursor();
  630. if ( pIbMarkup )
  631. {
  632. Assert( pIbMarkup->kHeaderCookie == pIbMarkup->m_uiHeaderCookie );
  633. // real markup exists in this index buffer
  634. pDrawHeader->m_eaIbMarkup = pIbMarkup;
  635. pDrawHeader->m_nIbMarkupPartitions = pIbMarkup->m_numPartitions;
  636. }
  637. else
  638. {
  639. pDrawHeader->m_eaIbMarkup = NULL;
  640. pDrawHeader->m_nIbMarkupPartitions = 0;
  641. }
  642. if( IsDeferredDrawQueue() )
  643. {
  644. *( ( IDirect3DVertexDeclaration9 ** )( pDrawHeader + 1 ) ) = pVertDecl;
  645. OpenDeferredChunk();
  646. m_nCurrentBatch = BATCH_GCMSTATE;
  647. ValidateDeferredQueue();
  648. }
  649. else
  650. {
  651. PushDrawBatchJob( GetDrawQueue()->GetSignal(), pDrawHeader, pVertDecl, pIbMarkup );
  652. BeginGcmStateTransaction();
  653. ZPassCheckpoint( 8 );
  654. if ( SPUGCM_DEBUG_MODE )
  655. {
  656. GCM_FUNC( cellGcmSetWriteBackEndLabel, GCM_LABEL_DEBUG0, (uint)pDrawHeader );
  657. CmdBufferFinish();
  658. volatile uint32 * pDebugLabel = cellGcmGetLabelAddress( GCM_LABEL_DEBUG0 );
  659. while( *pDebugLabel != ( uint ) pDrawHeader )
  660. {
  661. // this may happen due to latency , but it won't be an infinite loop
  662. //Msg( "Hmmmm... WriteLabel; Finish(); but label isn't set yet! 0x%X != 0x%X\n", *pDebugLabel, (uint)pDrawHeader );
  663. continue;
  664. }
  665. }
  666. }
  667. }
  668. bool ZPass::CanBegin( )
  669. {
  670. if( m_pCursor )
  671. {
  672. return false; // already begun
  673. }
  674. // we need at least some memory to store the job descriptor pointers
  675. if( GetSubchainCapacity( ) < 32 )
  676. {
  677. Warning( "Cannot begin ZPass: zpass job subchain buffer is full\n" );
  678. return false;
  679. }
  680. // we need a buffer in spuDrawQueue to store "ZPass begin, switch, end" commands
  681. // we may potentially need the space to store the whole state before ZPass, too
  682. return true;
  683. }
  684. void ZPass::Begin( uint32 * pCursor )
  685. {
  686. m_pCursor = pCursor;
  687. m_nDrawPassSubchain = m_nPut;
  688. m_pSubchain = GetCurrentCommandPtr();
  689. *m_pSubchain = CELL_SPURS_JOB_COMMAND_JTS;
  690. m_nFpcpStateEndOfJournalIdxAtZPassBegin = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
  691. }
  692. void ZPass::PushCommand( uint64 nCommand )
  693. {
  694. Validate();
  695. Assert( GetSubchainCapacity() > 2 );
  696. uint64 * pLwsync = GetCurrentCommandPtr();
  697. m_nPut++;
  698. uint64 * pCommand = GetCurrentCommandPtr();
  699. m_nPut++;
  700. uint64 * pJts = GetCurrentCommandPtr();
  701. Validate();
  702. *pJts = CELL_SPURS_JOB_COMMAND_JTS;
  703. *pCommand = nCommand;
  704. __lwsync();
  705. *pLwsync = CELL_SPURS_JOB_COMMAND_LWSYNC; // release the previous JTS
  706. }
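// The sequence above is a lock-free hand-off to the job chain consumer: a fresh JTS (jump-to-self)
// guard is written two slots ahead, the real command is written into the slot before it, and only
// after the lwsync barrier is the previous guard slot overwritten with LWSYNC, letting the consumer
// advance past it and pick up the new command.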
  707. bool CSpuGcm::BeginZPass( )
  708. {
  709. if( !IsDeferredDrawQueue() && m_zPass.CanBegin() )
  710. {
  711. // debug - do not checkin
  712. // while( g_pixelShaderPatcher.GetJournalSpaceLeftSince( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync ) > 20 )
  713. // {
  714. // g_pixelShaderPatcher.SetFragmentRegisterBlock(95, 1, (const float*)&g_spuGcmShared.m_eaFpcpSharedState->m_reg[95] );
  715. // }
  716. if( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob != g_pixelShaderPatcher.GetStateEndOfJournalIdx() )
  717. {
  718. GetDrawQueue()->Push2( SPUDRAWQUEUE_FLUSH_FPCP_JOURNAL, g_pixelShaderPatcher.GetStateEndOfJournalIdx() );
  719. }
  720. // this is where we start commands that we'll need to replay
  721. uint32 * pCursorBegin = GetDrawQueue()->GetCursor();
  722. uint nSafetyBufferWords = 4 ; // buffer so that when we come around, we can insert EndZPostPass method command (at least 3 words)
  723. uint nCommandWords = 2 // command : the command and EA of ZPassSavedState_t
  724. + nSafetyBufferWords
  725. + 4 // alignment buffer for ZPassSavedState_t
  726. + sizeof( ZPassSavedState_t );
  727. m_zPass.m_nJobPoolMarker = m_jobPool128.GetMarker();
  728. uint32 * pCmdBeginZPrepass = GetDrawQueue()->AllocWords( nCommandWords );
  729. pCmdBeginZPrepass[0] = SPUDRAWQUEUE_BEGINZPREPASS_METHOD | ( SPUDRAWQUEUE_BEGINZPREPASS_MASK & nCommandWords );
  730. ZPassSavedState_t * pSavedState = ( ZPassSavedState_t * )AlignValue( uintp( pCmdBeginZPrepass + 2 + nSafetyBufferWords ), 16 );
  731. pCmdBeginZPrepass[1] = ( uintp )pSavedState;
  732. m_zPass.m_pSavedState = pSavedState;
  733. //
  734. // WARNING.
  735. //
  736. // SPUDRAWQUEUE_BEGINZPREPASS_METHOD must be the last method that modifies g_spuGcmShared.m_dxGcmState in a job_gcmflush SpuDrawQueue.
  737. // This is because its implementation doesn't wait for DMA put to finish.
  738. //
  739. GCM_PERF_PUSH_MARKER( "ZPass_Z" );
  740. CmdBufferFlush();
  741. // actually begin; don't let anyone overwrite the commands after cursor
  742. m_zPass.Begin( pCursorBegin );
  743. GetDrawQueue()->PushFlushCallback( OnSpuDrawQueueFlushInZPass );
  744. PushSpuGcmCallSubchain( m_zPass.m_pSubchain ); // call all those SPUGCM jobs for the first time
  745. return true;
  746. }
  747. else
  748. return false;
  749. }
  750. void CSpuGcm::SetPredication( uint nPredicationMask ) // D3DPRED_* mask
  751. {
  752. uint32 * pCmd = GetDrawQueue()->AllocWords( 1 );
  753. *pCmd = SPUDRAWQUEUE_PREDICATION_METHOD | ( SPUDRAWQUEUE_PREDICATION_MASK & nPredicationMask );
  754. }
  755. void CSpuGcm::EndZPass( bool bPopMarker )
  756. {
  757. if( m_zPass && !m_zPass.m_isInEndZPass )
  758. {
  759. m_zPass.m_isInEndZPass = 1;
  760. GetDrawQueue()->PopFlushCallback();
  761. // as a precaution, since we don't need watermark-flush callbacks for the duration of this function, we'll disable it to avoid recursive flushes
  762. GetDrawQueue()->PushFlushCallback( OnSpuDrawQueueFlushDoNothing );
  763. // flush whatever state we may have.. it's not really needed to replay it twice, but whatever. we do need to replay it the 2nd time, and we can't just skip on it easily now in the 1st pass
  764. CmdBufferFlush();
  765. m_zPass.PushCommand( CELL_SPURS_JOB_COMMAND_RET );
  766. m_zPass.End(); // at this point, there's no more "Z prepass". There's just a bunch of SPUGCM commands waiting to be executed
  767. // replay from cursor
  768. uint32 * pCmdEndZPrepass = GetDrawQueue()->AllocWords( 2 );
  769. //m_nGcmFlushJobScratchSize = MAX( m_nGcmFlushJobScratchSize, CELL_GCM_VTXPRG_MAX_CONST );
  770. pCmdEndZPrepass[0] = SPUDRAWQUEUE_ENDZPREPASS_METHOD;
  771. pCmdEndZPrepass[1] = ( uintp )m_zPass.m_pSavedState;
  772. if( bPopMarker )
  773. {
  774. GCM_PERF_POP_MARKER( /*"ZPass_Z"*/ );
  775. GCM_PERF_MARKER( "ZPass_ZEnd" );
  776. }
  777. else
  778. {
  779. GCM_PERF_MARKER( "ZPass_Abort" );
  780. }
  781. CmdBufferFlush(); // commit the "End Z Prepass" command. NOTE: we don't want to commit it twice, so we End ZPass BEFORE we commit this command
  782. // even though Z Prepass is ended now, all those commands and their memory are still intact
  783. // re-execute them here now
  784. PushSpuGcmCallSubchain( m_zPass.m_pSubchain ); // call all those SPUGCM jobs again!
  785. GetDrawQueue()->PopFlushCallback();
  786. // SPUGCM ring release point: after this point, we can simply wait for more space to become available in SPUGCM draw command ring
  787. // Do we need to really end the render pass?
  788. // Hopefully not, because hopefully it'll just organically be indistinguishable from the non-Z-prepassed rendering
  789. uint32 * pCmdEndZPostPass = GetDrawQueue()->AllocWords( 3 );
  790. pCmdEndZPostPass[0] = SPUDRAWQUEUE_ENDZPOSTPASS_METHOD;
  791. pCmdEndZPostPass[1] = m_zPass.m_nPut;
  792. pCmdEndZPostPass[2] = (uintp)&m_zPass.m_nGet;
  793. GCM_PERF_MARKER( bPopMarker ? "ZPass_RenderEnd" : "AbortedZPass_RenderEnd" );
  794. CmdBufferFlush();
  795. m_zPass.m_isInEndZPass = 0;
  796. }
  797. else
  798. {
  799. if( bPopMarker )
  800. {
  801. GCM_PERF_POP_MARKER( );
  802. }
  803. }
  804. }
  805. void ZPass::Init()
  806. {
  807. m_nDummy = 0;
  808. m_pCursor = NULL;
  809. m_nJobs = 2048;
  810. m_pJobs = (uint64*)MemAlloc_AllocAligned( ( m_nJobs + 1 )* sizeof( uint64 ), 16 );
  811. m_pJobs[m_nJobs] = CELL_SPURS_JOB_COMMAND_NEXT( m_pJobs );
  812. m_nGet = 0;
  813. m_nPut = 0;
  814. m_isInEndZPass = 0;
  815. }
  816. void ZPass::Shutdown()
  817. {
  818. MemAlloc_FreeAligned( m_pJobs );
  819. }
  820. //#endif
  821. uint g_nEdgeJobChainMaxContention = 5;
  822. void CSpuGcm::OnVjobsInit()
  823. {
  824. int nJobPoolCount = Max<uint>( 256, g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 );
  825. int nCmdLineJobPoolCount = CommandLine()->ParmValue( "-spugcmJobPool", nJobPoolCount );
  826. if( nCmdLineJobPoolCount > nJobPoolCount && !( nCmdLineJobPoolCount & ( nCmdLineJobPoolCount - 1 ) ) )
  827. {
  828. Msg("Increasing spugcm cjob pool count from %d to %d\n", nJobPoolCount, nCmdLineJobPoolCount );
  829. nJobPoolCount = nCmdLineJobPoolCount;
  830. }
  831. // priority lower than the main job queue, in order to yield
  832. if( int nError = m_jobSink.Init( m_pRoot, 1, nJobPoolCount, ( uint8_t* )&m_pRoot->m_nSpugcmChainPriority, "spugcm", DMATAG_GCM_JOBCHAIN ) )
  833. {
  834. Error( "Cannot init SpuGcm, cell error %d\n", nError );
  835. }
  836. COMPILE_TIME_ASSERT( sizeof( job_edgegeom::JobDescriptor_t ) == 512 );
  837. if( int nError = g_spuGcmShared.m_edgeJobChain.Init( m_pRoot, g_nEdgeJobChainMaxContention, 128, ( uint8_t* )&m_pRoot->m_nEdgeChainPriority, sizeof( job_edgegeom::JobDescriptor_t ), CELL_SPURS_JOBQUEUE_DEFAULT_MAX_GRAB, "edge", DMATAG_EDGE_JOBCHAIN ) )
  838. {
  839. Error(" Cannot init SpuGcm, edge jobchain, error %d\n", nError );
  840. }
  841. if( int nError = g_spuGcmShared.m_fpcpJobChain.Init( m_pRoot, 1, 512, ( uint8_t* )&m_pRoot->m_nFpcpChainPriority, 128, CELL_SPURS_JOBQUEUE_DEFAULT_MAX_GRAB, "fpcp", DMATAG_FPCP_JOBCHAIN ) )
  842. {
  843. Error(" Cannot init SpuGcm, fpcp jobchain, error %d\n", nError );
  844. }
  845. if( nJobPoolCount < g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 ) // we need at least this much to avoid at least most stalls
  846. {
  847. Error( "Job pool count %d is too small! With %d jobs per segment, make it at least %d\n", nJobPoolCount, g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment(), g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 );
  848. }
  849. m_jobPool128.Init( nJobPoolCount );
  850. g_spuGcmShared.m_jobPoolEdgeGeom.Init( 128 );
  851. g_spuGcmShared.m_jobFpcPatch2 = *( m_pRoot->m_pFpcPatch2 );
  852. g_spuGcmShared.m_jobEdgeGeom = *( m_pRoot->m_pEdgeGeom );
  853. if( m_pMlaaBuffer )
  854. {
  855. g_edgePostWorkload.OnVjobsInit( m_pRoot );
  856. }
  857. }
  858. #if 0 // priorities test
  859. bool PriorityTest_t::Test( class VjobChain4 *pJobChain )
  860. {
  861. m_notify.m_nCopyFrom = 1;
  862. m_notify.m_nCopyTo = 0;
  863. uint nTick0 = __mftb();
  864. pJobChain->Run();
  865. uint nTick1 = __mftb();
  866. *( pJobChain->Push() ) = CELL_SPURS_JOB_COMMAND_JOB( &m_job );
  867. uint nTick2 = __mftb(), nTick3;
  868. do
  869. {
  870. nTick3 = __mftb();
  871. if( nTick3 - nTick2 > 79800000 * 5 )
  872. {
  873. Msg("%s:HANG\n", pJobChain->GetName());
  874. return false;
  875. }
  876. }
  877. while( !*(volatile uint32*)&m_notify.m_nCopyTo );
  878. Msg("%s[%d]:%5.0f+%5.0f(run=%5.0f)\n", pJobChain->GetName(), m_notify.m_nSpuId, (nTick2-nTick1)*40.1f, (nTick3-nTick2)*40.1f, (nTick1 - nTick0) * 40.1f );
  879. return true;
  880. }
  881. void CSpuGcm::TestPriorities()
  882. {
  883. PriorityTest_t * pTest = (PriorityTest_t*)MemAlloc_AllocAligned( sizeof( PriorityTest_t ), 128 );
  884. V_memset( &pTest->m_job, 0, sizeof( pTest->m_job ) );
  885. pTest->m_job.header = *(m_pRoot->m_pJobNotify);
  886. pTest->m_job.header.useInOutBuffer = 1;
  887. AddInputDma( &pTest->m_job, sizeof( pTest->m_notify ), &pTest->m_notify );
  888. pTest->m_job.workArea.userData[1] = 0; // function: default
  889. for( uint i = 0; i < 50; ++ i)
  890. {
  891. if( !pTest->Test( &g_spuGcmShared.m_edgeJobChain ) )
  892. return ; // leak
  893. if( ! pTest->Test( &g_spuGcmShared.m_fpcpJobChain ) )
  894. return ; // leak
  895. }
  896. MemAlloc_FreeAligned( pTest );
  897. }
  898. #endif
  899. void CSpuGcm::OnVjobsShutdown() // gets called before m_pRoot is about to be destructed and NULL'ed
  900. {
  901. CmdBufferFinish();
  902. g_edgePostWorkload.OnVjobsShutdown( m_pRoot );
  903. // in case of priority issues with job chains (when experimenting with reload_vjobs), let's first end and then join all workloads
  904. m_jobSink.End();
  905. g_spuGcmShared.m_fpcpJobChain.End();
  906. g_spuGcmShared.m_edgeJobChain.End();
  907. m_jobSink.Join();
  908. g_spuGcmShared.m_fpcpJobChain.Join();
  909. g_spuGcmShared.m_edgeJobChain.Join();
  910. m_jobPool128.Shutdown();
  911. g_spuGcmShared.m_jobPoolEdgeGeom.Shutdown();
  912. }
  913. void CSpuGcm::Shutdown()
  914. {
  915. g_pVJobs->Unregister( this ); // note: this will also call VjobsShutdown, which will join all SPU workloads and effectively call CmdBufferFinish();
  916. g_edgeGeomRing.Shutdown();
  917. if( m_pPcbringBuffer )
  918. {
  919. MemAlloc_FreeAligned( m_pPcbringBuffer );
  920. }
  921. m_spuDrawQueues[1].Shutdown();
  922. m_spuDrawQueues[0].Shutdown();
  923. #if SPU_GCM_DEBUG_TRACE
  924. MemAlloc_FreeAligned( g_spuGcmShared.m_eaDebugTraceBuffer );
  925. #endif
  926. m_zPass.Shutdown();
  927. for( uint i = 0; i < ARRAYSIZE( m_pDeferredStates ); ++i )
  928. {
  929. g_ps3gcmGlobalState.IoSlackFree( m_pDeferredStates[i] );
  930. }
  931. }
  932. void CSpuGcm::BeginScene()
  933. {
  934. DrawQueueNormal();
  935. if( m_nFramesToDisableDeferredQueue > 0 )
  936. {
  937. m_nFramesToDisableDeferredQueue-- ;
  938. }
  939. }
  940. void CSpuGcm::EndScene()
  941. {
  942. g_aici.m_nCpuActivityMask = g_edgeGeomRing.m_nUsedSpus;
  943. g_edgeGeomRing.m_nUsedSpus = 0;
  944. g_aici.m_nDeferredWordsAllocated = m_spuDrawQueues[1].m_nAllocWords - m_nDeferredQueueWords;
  945. m_nDeferredQueueWords = m_spuDrawQueues[1].m_nAllocWords;
  946. if( m_zPass )
  947. {
  948. ExecuteNTimes( 100, Warning( "SpuGcm:EndScene must Abort ZPass; mismatched BeginZPass/EndZPass\n" ) );
  949. AbortZPass();
  950. }
  951. if( g_spuGcmShared.m_enableStallWarnings )
  952. {
  953. if( m_jobPool128.m_nWaitSpins > 100 )
  954. {
  955. if( g_spuGcmShared.m_enableStallWarnings )
  956. {
  957. Warning( "SpuGcm: %d spins in job pool, PPU is really ahead of SPU and (probably) RSX.\n", m_jobPool128.m_nWaitSpins );
  958. }
  959. }
  960. m_jobPool128.m_nWaitSpins = 0;
  961. /*
  962. if( m_jobPool256.m_nWaitSpins )
  963. {
  964. if( g_spuGcmShared.m_enableStallWarnings )
  965. {
  966. Warning( "SpuGcm: %d spins in job pool 256, PPU is really ahead of SPU and (probably) RSX.\n", m_jobPool256.m_nWaitSpins );
  967. }
  968. m_jobPool256.m_nWaitSpins = 0;
  969. }
  970. */
  971. if( m_nPcbringWaitSpins > 100 )
  972. {
  973. if( g_spuGcmShared.m_enableStallWarnings )
  974. {
  975. Warning( "SpuGcm: %d spins in PcbRing, PPU is waiting for SPU (possibly) waiting for RSX\n", m_nPcbringWaitSpins );
  976. }
  977. }
  978. m_nPcbringWaitSpins = 0;
  979. }
  980. m_nFrame++;
  981. COMPILE_TIME_ASSERT( ARRAYSIZE( m_pDeferredStates ) == 2 ); // we need to rotate the array if it's not 2-element
  982. Swap( m_pDeferredStates[0], m_pDeferredStates[1] );
  983. extern ConVar r_ps3_mlaa;
  984. m_bUseDeferredDrawQueue = m_pMlaaBuffer && !( r_ps3_mlaa.GetInt() & 16 );
  985. }
  986. void CSpuGcm::CmdBufferFinish()
  987. {
  988. #ifdef CELL_GCM_DEBUG // [
  989. extern void (*fnSaveCellGcmDebugCallback)(struct CellGcmContextData*) = gCellGcmDebugCallback;
  990. gCellGcmDebugCallback = NULL; // disable recursive callback
  991. #endif // ]
  992. s_nFinishLabelValue++;
  993. GCM_FUNC( cellGcmSetWriteBackEndLabel, GCM_LABEL_SPUGCM_FINISH, s_nFinishLabelValue );
  994. CmdBufferFlush();
  995. Assert( s_nStopAtFinishLabelValue != s_nFinishLabelValue );
  996. // now wait for RSX to reach the finish label
  997. uint nSpins = 0;
  998. uint nTbStart = __mftb();
  999. volatile uint32 * pLastJobUpdate = &g_spuGcmShared.m_eaLastJobThatUpdatedMe;
  1000. while( ( s_nFinishLabelValue != *m_pFinishLabel ) ||
  1001. ( *pLastJobUpdate != m_eaLastJobThatUpdatesSharedState ) )
  1002. {
  1003. sys_timer_usleep( 30 ); // don't hog the PPU
  1004. ++nSpins;
  1005. #ifndef _CERT
  1006. if( nSpins && ( nSpins % 100000 == 0 ) )
  1007. {
  1008. Warning(
  1009. "** SpuGcm detected an SPU/RSX hang. **\n"
  1010. );
  1011. }
  1012. #endif
  1013. }
  1014. uint nTbEnd = __mftb();
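// __mftb() reads the PPU time base (the PS3 time base runs at roughly 79.8 MHz), so the division
// by 80 below converts the tick delta into an approximate microsecond count.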
  1015. if( nSpins > 1000 )
  1016. {
  1017. Warning( "Long wait (%d us / %d spins) in CmdBufferFinish()\n", ( nTbEnd - nTbStart ) / 80, nSpins );
  1018. }
  1019. #ifdef CELL_GCM_DEBUG // [
  1020. gCellGcmDebugCallback = fnSaveCellGcmDebugCallback;
  1021. #endif // ]
  1022. }
  1023. void CSpuGcm::SyncMlaa( void * pLocalSurface )
  1024. {
  1025. uint nInSurfaceOffset = ( g_ps3gcmGlobalState.m_nRenderSize[1]/2 * g_ps3gcmGlobalState.m_nSurfaceRenderPitch ) & -16;
  1026. vec_int4 * pIn = ( vec_int4 * )( ( uintp( m_pMlaaBuffer ) + nInSurfaceOffset ) ), *pOut = ( vec_int4 * ) ( uintp( pLocalSurface ) + nInSurfaceOffset );
  1027. uint nRowWidth = g_ps3gcmGlobalState.m_nSurfaceRenderPitch/64, nExclude = ( m_nFrame % ( nRowWidth - 2 ) ) + 1;
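// The loops below appear to copy a short band of the MLAA buffer back to the local surface with every
// 16-byte vector bitwise-inverted ( vec_nor( x, x ) == ~x ), skipping one 64-byte column whose index
// rotates with the frame counter, which reads like a moving visual marker used to verify synchronization.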
  1028. for( uint nRow = 0; nRow < 4; ++nRow )
  1029. {
  1030. vec_int4 * pRowIn = AddBytes( pIn, g_ps3gcmGlobalState.m_nSurfaceRenderPitch * nRow );
  1031. vec_int4 * pRowOut = AddBytes( pOut, g_ps3gcmGlobalState.m_nSurfaceRenderPitch * nRow );
  1032. for( uint i = 0; i < nExclude; i ++ )
  1033. {
  1034. vec_int4 *input = pRowIn + i * 4, *output = pRowOut + i * 4;
  1035. output[0] = vec_nor( input[0], input[0] );
  1036. output[1] = vec_nor( input[1], input[1] );
  1037. output[2] = vec_nor( input[2], input[2] );
  1038. output[3] = vec_nor( input[3], input[3] );
  1039. }
  1040. for( uint i = nExclude + 1; i < nRowWidth ; ++i )
  1041. {
  1042. vec_int4 *input = pRowIn + i*4, *output = pRowOut + i*4;
  1043. output[0] = vec_nor( input[0], input[0] );
  1044. output[1] = vec_nor( input[1], input[1] );
  1045. output[2] = vec_nor( input[2], input[2] );
  1046. output[3] = vec_nor( input[3], input[3] );
  1047. }
  1048. }
  1049. }
  1050. void CSpuGcm::CloseDeferredChunk()
  1051. {
  1052. Assert( m_nSpuDrawQueueSelector == 1 );
  1053. uint32 * pDeferredQueueCursor = m_spuDrawQueues[1].GetCursor();
  1054. if( m_pDeferredChunkHead )
  1055. {
  1056. #ifdef _DEBUG
  1057. m_nChunksClosedInSegment++;
  1058. #endif
  1059. // mark the previous chunk with its end
  1060. m_pDeferredChunkHead[1] = ( uint32 )pDeferredQueueCursor;
  1061. m_pDeferredChunkHead = NULL;
  1062. }
  1063. m_pDeferredQueueCursors[0] = pDeferredQueueCursor;
  1064. ValidateDeferredQueue();
  1065. }
  1066. #if SPUGCM_DEBUG_MODE
  1067. uint g_nDeferredChunks[0x800][4], g_nDeferredChunkCount = 0;
  1068. #endif
  1069. uint32* CSpuGcm::OpenDeferredChunk( uint nHeader, uint nAllocExtra )
  1070. {
  1071. Assert( IsValidDeferredHeader( nHeader ) );
  1072. Assert( m_nSpuDrawQueueSelector == 1 );
  1073. // skip allocation of the new chunk if the current chunk is empty
  1074. if( !m_pDeferredChunkHead || m_pDeferredChunkHead + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS != GetDrawQueue()->GetCursor() || nAllocExtra > 0 )
  1075. {
  1076. // we don't have an empty chunk already; allocate more
  1077. CloseDeferredChunk();
  1078. m_pDeferredChunkHead = GetDrawQueue()->AllocWords( SPUDRAWQUEUE_DEFERRED_HEADER_WORDS + nAllocExtra );
  1079. }
1080. m_pDeferredChunkHead[0] = nHeader; // record the chunk type; by default callers pass the plain GCM flush method
  1081. m_nDeferredChunkHead = nHeader;
  1082. m_pDeferredChunkHead[1] = ( uintp )GetDrawQueue()->GetCursor();
  1083. ValidateDeferredQueue();
  1084. #ifdef _DEBUG
  1085. if( SPUDRAWQUEUE_DEFERRED_HEADER_WORDS > 2 )
  1086. {
  1087. m_pDeferredChunkHead[2] = GetDrawQueue()->m_nAllocCount;
  1088. }
  1089. #endif
  1090. #if SPUGCM_DEBUG_MODE
  1091. uint nIdx = (g_nDeferredChunkCount++)%(ARRAYSIZE(g_nDeferredChunks));
  1092. Assert( nIdx < ARRAYSIZE(g_nDeferredChunks) );
  1093. uint * pDebug = g_nDeferredChunks[nIdx];
  1094. pDebug[0] = nHeader;
  1095. pDebug[1] = (uint32)m_pDeferredChunkHead;
  1096. pDebug[2] = nAllocExtra;
  1097. pDebug[3] = GetDrawQueue()->m_nAllocCount;
  1098. #endif
  1099. GetDrawQueue()->SetFlushWatermarkFrom( m_pDeferredChunkHead );
  1100. return m_pDeferredChunkHead + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS;
  1101. }
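// m_nSpuDrawQueueSelector appears to select the active SpuDrawQueue: 0 is the normal
// (immediate) queue that feeds the SPUs directly, 1 is the deferred queue whose chunks are
// recorded now and replayed later (see ExecuteDeferredDrawQueue). DrawQueueNormal switches
// back to the immediate queue, closing the currently open deferred chunk first.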
  1102. void CSpuGcm::DrawQueueNormal( bool bExecuteDeferredQueueSegment )
  1103. {
  1104. if( m_nSpuDrawQueueSelector != 0 )
  1105. {
  1106. FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
  1107. Assert( *m_pDeferredChunkHead == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD );
  1108. GetDrawQueue()->Push1( SPUDRAWQUEUE_PERF_MARKER_DrawNormal );
  1109. CloseDeferredChunk();
  1110. m_pDeferredQueueCursors[0] = m_spuDrawQueues[1].GetCursor();
  1111. /*uint nBytesInSegment = m_spuDrawQueues[1].Length( m_pDeferredQueueSegment, m_pDeferredQueueCursors[0] );
  1112. Msg( "DrawQueueNormal %p..%p=%.1fKB (%p,%p)\n", m_pDeferredQueueSegment, m_pDeferredQueueCursors[0],
  1113. nBytesInSegment / 1024.0f,
  1114. m_pDeferredQueueCursors[1], m_pDeferredQueueCursors[2] );*/
  1115. m_nSpuDrawQueueSelector = 0;
  1116. if( m_pDeferredQueueSegment && bExecuteDeferredQueueSegment )
  1117. {
  1118. ExecuteDeferredDrawQueueSegment( m_pDeferredQueueSegment, m_pDeferredQueueCursors[0], false );
  1119. m_pDeferredQueueSegment = NULL;
  1120. }
  1121. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
  1122. m_pDeferredChunkHead = NULL;
  1123. BeginGcmStateTransaction();
  1124. }
  1125. if( m_nFramesToDisableDeferredQueue > 0 )
  1126. {
  1127. ExecuteDeferredDrawQueue( 0 );
  1128. }
  1129. }
  1130. /*
  1131. void CSpuGcm::DisableMlaaForTwoFrames()
  1132. {
  1133. g_flipHandler.DisableMlaaForTwoFrames();
  1134. m_nFramesToDisableDeferredQueue = 2; // this frame and next will have disabled deferred queue
  1135. DrawQueueNormal();
  1136. }
  1137. */
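// There appear to be two ways to turn MLAA off: DisableMlaa() just switches back to the
// normal queue and tells the flip handler to skip MLAA, leaving the deferred commands to
// be flushed at Flip as usual; DisableMlaaPermanently() additionally replays the previous
// frame's deferred queue right away and then restores the render target, since the
// replayed commands may have rebound it.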
  1138. void CSpuGcm::DisableMlaa()
  1139. {
  1140. DrawQueueNormal( false );
1141. // we could flush the previous frame here, but we don't have to:
1142. // we'll do that at Flip, the same way we do it every frame
  1143. g_flipHandler.DisableMlaa();
  1144. }
  1145. void CSpuGcm::DisableMlaaPermanently()
  1146. {
  1147. DrawQueueNormal( false );
  1148. g_flipHandler.QmsAdviceBeforeDrawPrevFramebuffer();
  1149. // flush previous frame first
  1150. ExecuteDeferredDrawQueue( 1 );
  1151. g_flipHandler.DisableMlaaPermannetly();
  1152. g_flipHandler.DisableMlaa();
  1153. extern void DxDeviceForceUpdateRenderTarget( );
1154. DxDeviceForceUpdateRenderTarget( ); // recover the main render target, as it was clobbered by executing the previous frame's commands
  1155. ExecuteDeferredDrawQueue( 0 );
  1156. }
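// When the deferred queue is enabled, DrawQueueDeferred() flushes the immediate queue,
// switches the selector to queue 1 and opens a GCMFLUSH chunk. On the first call of a
// frame it also pushes SPUDRAWQUEUE_DEFER_STATE on the immediate queue ( presumably
// snapshotting the current state into m_pDeferredStates[0] ) and records
// SPUDRAWQUEUE_UNDEFER_STATE into the deferred queue itself, so that the snapshot is
// re-applied when the recorded chunks are replayed.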
  1157. CSpuGcm::DrawQueueDeferred_Result CSpuGcm::DrawQueueDeferred() // may flush previous frame deferred queue the first time
  1158. {
  1159. DrawQueueDeferred_Result result;
  1160. if( m_bUseDeferredDrawQueue && ( m_nFramesToDisableDeferredQueue == 0 ) && ( m_nSpuDrawQueueSelector != 1 ) )
  1161. {
  1162. FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
  1163. // do we have anything in the deferred queue?
  1164. result.isFirstInFrame = m_pDeferredQueueCursors[0] == m_pDeferredQueueCursors[1];
  1165. GetDrawQueue()->Push1( SPUDRAWQUEUE_PERF_MARKER_DrawDeferred );
  1166. if( result.isFirstInFrame )
  1167. {
  1168. GetDrawQueue()->Push2( SPUDRAWQUEUE_DEFER_STATE, uintp( m_pDeferredStates[0] ) );
  1169. }
1170. // before we dive into the deferred queue, flush the current queue, because we'll have to restart it when we dive back out of the deferred queue
1171. // this also makes sure that any state dump the deferred queue needs is written out before the deferred queue tries to execute
  1172. GcmStateFlush();
  1173. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
  1174. //ExecuteDeferredDrawQueue( 1 ); // dubious: we might want to execute this in the end of the frame to avoid undesirable state changes
  1175. m_nSpuDrawQueueSelector = 1;
  1176. BeginGcmStateTransaction();
  1177. m_pDeferredQueueSegment = m_spuDrawQueues[1].GetCursor();
  1178. #ifdef _DEBUG
  1179. m_nChunksClosedInSegment = 0;
  1180. #endif
  1181. //Msg( "DrawQueueDeferred %p / %.1f KB free...", m_pDeferredQueueSegment, m_spuDrawQueues[1].Length( m_pDeferredQueueSegment, m_spuDrawQueues[1].m_pGet ) );
  1182. OpenDeferredChunk( SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD );
1183. if( result.isFirstInFrame ) // the "UNDEFER" command itself gets recorded into the deferred queue here
  1184. {
  1185. GetDrawQueue()->Push2( SPUDRAWQUEUE_UNDEFER_STATE, uintp( m_pDeferredStates[0] ) );
  1186. }
  1187. }
  1188. else
  1189. {
  1190. result.isFirstInFrame = false;
  1191. }
  1192. return result;
  1193. }
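// m_pDeferredQueueCursors appears to be a small history of deferred-queue cursors, one
// slot per frame: [0] is the end of the current frame's recording, [1] the end of the
// previous frame, and the last slot marks where replay last stopped.
// ExecuteDeferredDrawQueue( nPrevious ) replays everything between that oldest mark and
// cursor [nPrevious]; FlipDeferredDrawQueue shifts the history down by one slot each flip.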
1194. // returns true if the SPU will later free up some memory by poking the corresponding GET pointer
  1195. bool CSpuGcm::ExecuteDeferredDrawQueue( uint nPrevious )
  1196. {
  1197. Assert( !IsDeferredDrawQueue() );
  1198. // just copy the commands to the main spugcm buffer
  1199. Assert( m_pDeferredQueueCursors[0] == m_spuDrawQueues[1].GetCursor() || m_pDeferredQueueCursors[0] == m_pDeferredChunkHead );
  1200. uint32 * pCmdEnd = m_pDeferredQueueCursors[nPrevious];//, *pCmdEnd = ( ( nPrevious == 0 ) ? m_spuDrawQueues[1].GetCursor() : m_pDeferredQueueCursors[ nPrevious - 1 ] );
  1201. uint32 * pCmdBegin = m_pDeferredQueueCursors[ARRAYSIZE(m_pDeferredQueueCursors)-1];
  1202. if( pCmdEnd == pCmdBegin )
  1203. return false;
  1204. //Msg( "ExecuteDeferredDrawQueue(%d) %p..%p=%.1fKB\n", nPrevious, pCmdBegin, pCmdEnd, m_spuDrawQueues[1].Length( pCmdBegin, pCmdEnd ) );
  1205. FillNops( &PCB_RING_CTX );
  1206. #if defined( _DEBUG ) && !defined( _CERT )
  1207. m_spuDrawQueues[0].Push1( SPUDRAWQUEUE_PERF_MARKER_AAReplay );
  1208. #endif
  1209. GcmStateFlush();
1210. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() ); // we're not in the deferred queue, so GcmStateFlush calls BeginGcmStateTransaction, which resets the current batch cursor
  1211. bool bMoveGet = ExecuteDeferredDrawQueueSegment( pCmdBegin, pCmdEnd, true );
  1212. #if defined( _DEBUG ) && !defined( _CERT )
  1213. m_spuDrawQueues[0].Push1( SPUDRAWQUEUE_PERF_MARKER_AAReplayEnd );
  1214. SetCurrentBatchCursor( GetDrawQueue()->GetCursor() );
  1215. #endif
  1216. // forget about previously executed frames/chunks
  1217. for( uint i = nPrevious + 1; i < ARRAYSIZE( m_pDeferredQueueCursors ); ++i )
  1218. m_pDeferredQueueCursors[i] = pCmdEnd;
  1219. return bMoveGet;
  1220. }
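// The replay walk below treats the segment as a chain of chunks: a zero word is a single
// NOP, a SPUDRAWQUEUE_NOPCOUNT_METHOD word skips a run of NOPs, and anything else is a
// chunk header whose second word points at the next chunk. SET_FP_CONST chunks are applied
// immediately on the PPU through the fragment-program patcher; GCMFLUSH and DRAW chunks
// become SPU jobs ( PushStateFlushJob / PushDrawBatchJob ), which is also what eventually
// advances the deferred queue's GET pointer and frees its memory.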
  1221. bool CSpuGcm::ExecuteDeferredDrawQueueSegment( uint32 * pCmdBegin, uint32 * pCmdEnd, bool bExecuteDraws )
  1222. {
  1223. Assert( m_nCurrentBatch == BATCH_GCMSTATE );
1224. // we must already have switched back to the normal queue before replaying deferred commands into it
  1225. Assert( !IsDeferredDrawQueue() );
  1226. bool bMoveGet = false;
1227. uint nResultantSpuDrawQueueIndex = bExecuteDraws ? 1 : 2; // index [2] is a dummy, write-only resultant "GET" register
  1228. #if SPUGCM_DEBUG_MODE
  1229. uint nDeferredChunkDebugIdx = 0xFFFFFFFF;
  1230. for( uint i = 1;i <= ARRAYSIZE( g_nDeferredChunks ); ++i )
  1231. {
  1232. uint j = ( g_nDeferredChunkCount - i ) & ( ARRAYSIZE( g_nDeferredChunks ) - 1 );
  1233. if( g_nDeferredChunks[j][1] == uintp( pCmdBegin ) )
  1234. {
  1235. nDeferredChunkDebugIdx = j;
  1236. break;
  1237. }
  1238. }
  1239. Assert( nDeferredChunkDebugIdx < ARRAYSIZE( g_nDeferredChunks ) );
  1240. #endif
  1241. SpuDrawQueue *pDrawQueue = &m_spuDrawQueues[1];
  1242. for( uint32 * pCmd = pDrawQueue->NormalizeCursor( pCmdBegin ), * pCmdNormalizedEnd = pDrawQueue->NormalizeCursor( pCmdEnd ), *pPrev = pCmd; pCmd != pCmdNormalizedEnd; )
  1243. {
  1244. if( !IsCert() && !pDrawQueue->IsValidCursor( pCmd ) )
  1245. DebuggerBreakIfDebugging();
  1246. uint nCmd = *pCmd;
  1247. if( nCmd == 0 )
  1248. {
  1249. pCmd++;
  1250. }
  1251. else if( ( nCmd & SPUDRAWQUEUE_METHOD_MASK ) == SPUDRAWQUEUE_NOPCOUNT_METHOD )
  1252. {
  1253. pCmd += 1 + ( nCmd & SPUDRAWQUEUE_NOPCOUNT_MASK );
  1254. }
  1255. else
  1256. {
  1257. uint32 * pNext = (uint32*)pCmd[1], *pCmdHeaderEnd = pCmd + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS;
  1258. Assert( m_spuDrawQueues[1].IsValidCursor( pNext ) );
  1259. #if SPUGCM_DEBUG_MODE
  1260. for( uint i = 0; ; ++i )
  1261. {
  1262. uint j = ( nDeferredChunkDebugIdx + i ) & ( ARRAYSIZE( g_nDeferredChunks ) - 1 );
  1263. if( g_nDeferredChunks[j][1] == uintp( pCmd ) )
  1264. {
  1265. nDeferredChunkDebugIdx = j;
  1266. break;
  1267. }
  1268. if( i >= ARRAYSIZE( g_nDeferredChunks ) ) // stop if we don't find the debug idx
  1269. {
  1270. DebuggerBreak();
  1271. break;
  1272. }
  1273. }
  1274. #endif
  1275. switch ( nCmd & SPUDRAWQUEUE_DEFERRED_METHOD_MASK )
  1276. {
  1277. case SPUDRAWQUEUE_DEFERRED_SET_FP_CONST_METHOD:
  1278. {
  1279. uint nStartRegister = ( nCmd >> 12 ) & 0xFFF, nRegisterCount = nCmd & 0xFFF;
  1280. Assert( nStartRegister < 96 && nRegisterCount <= 96 );
  1281. OnSetPixelShaderConstant();
  1282. g_pixelShaderPatcher.SetFragmentRegisterBlock( nStartRegister, nRegisterCount, ( const float* )pCmdHeaderEnd );
  1283. //m_dirtyCachesMask |= DxAbstractGcmState_t::kDirtyPxConstants;
  1284. }
  1285. break;
  1286. case SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD:
  1287. if( nCmd == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD || bExecuteDraws )
  1288. {
  1289. PushStateFlushJob( pDrawQueue, uint( pNext ) | nResultantSpuDrawQueueIndex, pCmdHeaderEnd, pNext );
  1290. Assert( m_pDeferredChunkSubmittedTill[nResultantSpuDrawQueueIndex] == pNext );
  1291. bMoveGet = true;
  1292. }
  1293. break;
  1294. case SPUDRAWQUEUE_DEFERRED_DRAW_METHOD:
  1295. if( bExecuteDraws )
  1296. {
  1297. Assert( nCmd == SPUDRAWQUEUE_DEFERRED_DRAW_METHOD );
  1298. SpuDrawHeader_t * pDrawHeader = ( SpuDrawHeader_t * )AlignValue( uintp( pCmdHeaderEnd ), 16 );
  1299. // at the time we set up these deferred calls, we don't track the FPCP journal, so we need to refresh the indices referring into it here
  1300. pDrawHeader->m_nFpcpEndOfJournalIdx = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
  1301. CellSpursJob128 * pDrawJob = PushDrawBatchJob( uint( pNext ) | nResultantSpuDrawQueueIndex, pDrawHeader, *( IDirect3DVertexDeclaration9** )( pDrawHeader + 1 ), pDrawHeader->m_eaIbMarkup );
  1302. Assert( m_pDeferredChunkSubmittedTill[nResultantSpuDrawQueueIndex] == pNext );
  1303. bMoveGet = true;
  1304. }
  1305. break;
  1306. }
  1307. pPrev = pCmd;
  1308. pCmd = pNext;
  1309. }
  1310. pCmd = pDrawQueue->NormalizeCursor( pCmd );
  1311. }
  1312. return bMoveGet;
  1313. }
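// At flip time the deferred-cursor history is rotated: the current queue cursor becomes
// slot [0] and every older slot moves one step towards the end of the array, so the last
// slot always marks the oldest data that may still need replaying.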
  1314. void CSpuGcm::FlipDeferredDrawQueue()
  1315. {
  1316. //Msg( "FlipDeferredDrawQueue {%p,%p,%p} Frame=%d\n", m_pDeferredQueueCursors[0], m_pDeferredQueueCursors[1], m_pDeferredQueueCursors[2], m_nFrame );
  1317. Assert( !IsDeferredDrawQueue() );
  1318. m_pDeferredQueueCursors[0] = m_spuDrawQueues[1].GetCursor();
  1319. for( uint i = ARRAYSIZE( m_pDeferredQueueCursors ); i-- > 1; )
  1320. {
  1321. m_pDeferredQueueCursors[ i ] = m_pDeferredQueueCursors[ i - 1 ];
  1322. }
  1323. }
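// The Edge Post workload runs MLAA on the SPUs. The scratch buffer is sized as
// EDGE_POST_MLAA_HANDLER_SPU_BUFFER_SIZE( numSpus ) * 3 - presumably triple-buffered -
// and the workload synchronizes with the RSX through the GCM_LABEL_EDGEPOSTMLAA label.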
  1324. void CEdgePostWorkload::OnVjobsInit( VJobsRoot* pRoot )
  1325. {
  1326. uint numSpus = 5, nScratchSize = EDGE_POST_MLAA_HANDLER_SPU_BUFFER_SIZE( numSpus ) * 3;
  1327. m_pMlaaScratch = MemAlloc_AllocAligned( nScratchSize, EDGE_POST_MLAA_HANDLER_BUFFER_ALIGN );
  1328. int nOk = edgePostMlaaInitializeContext( &m_mlaaContext, numSpus, &pRoot->m_spurs, ( uint8_t* )&pRoot->m_nEdgePostWorkloadPriority, GCM_LABEL_EDGEPOSTMLAA, m_pMlaaScratch, nScratchSize );
  1329. if( nOk != CELL_OK )
  1330. {
  1331. Warning("Cannot initialize MLAA, error %d\n", nOk );
  1332. edgePostMlaaDestroyContext( &m_mlaaContext );
  1333. MemAlloc_FreeAligned( m_pMlaaScratch );
  1334. return;
  1335. }
  1336. m_isInitialized = true;
  1337. }
  1338. void CEdgePostWorkload::OnVjobsShutdown( VJobsRoot* pRoot )
  1339. {
  1340. if( m_isInitialized )
  1341. {
  1342. edgePostMlaaDestroyContext( &m_mlaaContext );
  1343. MemAlloc_FreeAligned( m_pMlaaScratch );
  1344. m_isInitialized = false;
  1345. }
  1346. }
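// The "ghost" context below is a tiny, fixed-size CellGcmContextData that TruePause()
// points at a reserved slice of the sysring. It only ever holds a few jump and label-write
// commands, so running out of space indicates a logic error - hence the callback reports
// an error instead of growing the buffer.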
  1347. int32_t GhostGcmCtxCallback( struct CellGcmContextData *pContext, uint32_t nCount )
  1348. {
  1349. Error("Trying to allocate %d more words in the ghost context\n", nCount );
  1350. return CELL_ERROR_ERROR_FLAG;
  1351. }
  1352. enum TruePauseStateEnum_t
  1353. {
  1354. TRUE_PAUSE_NONE,
  1355. TRUE_PAUSE_SPINNING,
  1356. TRUE_PAUSE_LOCKED0, // locked, Shoulder and X buttons down
  1357. TRUE_PAUSE_LOCKED1, // locked, Shoulder button up
  1358. TRUE_PAUSE_SINGLE_STEP
  1359. };
  1360. TruePauseStateEnum_t g_nTruePauseState = TRUE_PAUSE_NONE;
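// TruePause() looks like a developer "freeze frame" mode: it finishes the command buffer,
// then repeatedly re-submits the last couple of captured GCM frames to the RSX by patching
// jump commands into the ends of those frames and bouncing the GCM_LABEL_REPLAY label
// through 0 -> 1 -> 2. Gamepad input drives the state machine above: hold the shoulder
// button to keep spinning, press the lock button to stay paused after releasing it, and
// the single-step button replays exactly one more frame before returning here.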
  1361. bool CSpuGcm::TruePause()
  1362. {
  1363. switch( g_nTruePauseState )
  1364. {
  1365. case TRUE_PAUSE_NONE:
  1366. g_nTruePauseState = TRUE_PAUSE_SPINNING;
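// deliberate fall-through: after arming the spin state, proceed the same way as a
// single-step re-entry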
  1367. case TRUE_PAUSE_SINGLE_STEP:
  1368. break; // re-entering after single step
  1369. default:
  1370. g_nTruePauseState = TRUE_PAUSE_NONE;
  1371. return false; // inconsistent state, don't try to continue
  1372. }
1373. CmdBufferFinish(); // this will put the end marker on the last frame.
  1374. g_spuGcmShared.m_sysring.NotifyRsxGet( g_spuGcmShared.m_eaGcmControlRegister->get );
  1375. //Assert( g_spuGcmShared.m_sysring.m_nPut == g_spuGcmShared.m_sysring.m_nEnd );
  1376. const uint nReserve = 0x1000;
  1377. if( !g_spuGcmShared.m_sysring.CanPutNoWrap( nReserve ) )
  1378. {
  1379. if( !g_spuGcmShared.m_sysring.CanWrapAndPut( nReserve ) )
  1380. {
  1381. Msg( "Cannot replay because sysring wraps around right here and you got unlucky. If you get this a lot, ask Sergiy to implement/fix wrap-around replay\n" );
  1382. return false;
  1383. }
  1384. g_spuGcmShared.WrapSequence();
  1385. }
  1386. int nReplayFrames = 2;
  1387. if( !g_spuGcmShared.CanReplayPastFrames( nReplayFrames, nReserve ) )
  1388. {
  1389. uint nSysringBytesNeeded = 0;
  1390. Warning( "Cannot replay frames: %d frames didn't fit into command buffer of %d bytes and was generated and executed in multiple passes/segments\n", nReplayFrames, g_ps3gcmGlobalState.m_nCmdSize );
  1391. return false;
  1392. }
1393. // all relevant SPU and RSX activity has ceased at this point
  1394. uintp eaEnd = g_spuGcmShared.m_sysring.EaPut();
  1395. uint32 * pEnd = (uint32*)eaEnd;
  1396. uint nIoOffsetEnd = eaEnd + g_spuGcmShared.m_nIoOffsetDelta;
  1397. //nOffsetBeginFrame = g_spuGcmShared.m_sysring.PutToEa( g_spuGcmShared.GetPastFrame(2).m_nSysringBegin ) + g_spuGcmShared.m_nIoOffsetDelta;
  1398. //uint nSurfaceFlipIndex = g_ps3gcmGlobalState.m_display.surfaceFlipIdx, nSurfaceFlipAltIndex = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex();
  1399. //CPs3gcmLocalMemoryBlock &altSurface = g_ps3gcmGlobalState.m_display.surfaceColor[nSurfaceFlipAltIndex];
  1400. //V_memset( altSurface.DataInAnyMemory(), 0, altSurface.Size() );
  1401. int nCurrentReplayFrame = 1;
1402. // Note: we probably shouldn't start with the frame that renders into the same surface as the last frame flipped
  1403. uint32 * pReplayLabelReset = (uint32*)g_spuGcmShared.m_sysring.EaPut();
  1404. uint nReplayLabelResetIoOffset = uintp( pReplayLabelReset ) + g_spuGcmShared.m_nIoOffsetDelta;
  1405. CellGcmContextData ghostCtx;
  1406. ghostCtx.current = ghostCtx.begin = pReplayLabelReset;
  1407. uint32 * pGhostAreaEnd = ghostCtx.end = ghostCtx.begin + ( nReserve / sizeof( uint32 ) );
  1408. ghostCtx.callback = GhostGcmCtxCallback;
  1409. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 0 );
  1410. uint32 * pReplayGhostArea = ghostCtx.current;
  1411. uint nReplayGhostAreaIoOffset = uintp( pReplayGhostArea ) + g_spuGcmShared.m_nIoOffsetDelta;
  1412. g_spuGcmShared.m_sysring.Put( uintp( pReplayGhostArea ) - uintp( pReplayLabelReset ) );
  1413. Assert( g_spuGcmShared.m_sysring.EaPut() == uintp( pReplayGhostArea ) );
  1414. volatile uint32 * pLabelReplay = cellGcmGetLabelAddress( GCM_LABEL_REPLAY );
  1415. *pLabelReplay = 0xFFFFFFFF;
  1416. __sync();
  1417. bool isFirstIteration = true;
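// Replay protocol, as far as can be told from the code below: each iteration the ghost
// area is rewritten so that the RSX resets GCM_LABEL_REPLAY to 0, jumps into the captured
// frame ( whose trailing NOP has been patched into a jump back here ), writes the label to
// 1, then flips the previously rendered surface and writes 2; the PPU busy-waits on each
// label value to stay in lockstep with the RSX.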
  1418. do
  1419. {
  1420. g_spuGcmShared.m_eaGcmControlRegister->put = nReplayGhostAreaIoOffset;
  1421. while( *pLabelReplay != 0 )
  1422. continue;
  1423. // we're now synchronized at the beginning of ghost area
  1424. switch( g_nTruePauseState )
  1425. {
  1426. case TRUE_PAUSE_NONE:
  1427. return false;
  1428. case TRUE_PAUSE_SINGLE_STEP:
  1429. if( !isFirstIteration )
  1430. {
  1431. return true;
  1432. }
  1433. break;
  1434. }
  1435. const BeginFrameRecord_t &pastFrame = g_spuGcmShared.GetPastFrame( nCurrentReplayFrame );
  1436. int nOffsetBeginFrame = uintp( pastFrame.m_eaBegin ) + g_spuGcmShared.m_nIoOffsetDelta, nOffsetEndFrame = uintp( pastFrame.m_eaEnd ) + g_spuGcmShared.m_nIoOffsetDelta;
  1437. Msg("frame@ %X..%X ", nOffsetBeginFrame , nOffsetEndFrame );
  1438. ghostCtx.current = ghostCtx.begin = pReplayGhostArea;
  1439. ghostCtx.end = pGhostAreaEnd;
  1440. *( ghostCtx.current++ ) = CELL_GCM_JUMP( nOffsetBeginFrame ); // jump to the beginning of the frame we want to replay
  1441. uint32 nOffsetReturnFromFrame = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
  1442. Assert( pastFrame.m_eaEnd[0] == 0 && pastFrame.m_eaEnd[1] == 0 && pastFrame.m_eaEnd[2] == 0 && pastFrame.m_eaEnd[3] == 0 ); // we expect 4 NOPs at the end of the frame
  1443. Assert( pastFrame.m_eaBegin[0] == 0 && pastFrame.m_eaBegin[1] == 0 && pastFrame.m_eaBegin[2] == 0 && pastFrame.m_eaBegin[3] == 0 ); // we expect 4 NOPs at the beginning of the frame
  1444. pastFrame.m_eaEnd[0] = CELL_GCM_JUMP( nOffsetReturnFromFrame ); // return to replay area after rendering the whole frame
  1445. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 1 );
  1446. __sync();
  1447. uint32 nTickStart = __mftb(); // let's start rendering (replaying) the captured GCM frame
  1448. g_spuGcmShared.m_eaGcmControlRegister->put = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
  1449. while( *pLabelReplay != 1 )
  1450. continue;
  1451. int nSurfaceFlipIndex = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex( nCurrentReplayFrame );
  1452. Assert( nSurfaceFlipIndex >= 0 );
  1453. while ( cellGcmGetFlipStatus() != CELL_GCM_DISPLAY_FLIP_STATUS_DONE )
  1454. {
  1455. // Wait for the previous flip to completely finish
  1456. ThreadSleep( 1 );
  1457. }
  1458. cellGcmResetFlipStatus(); // Need to reset GCM flip status
  1459. // start flipping
  1460. cellGcmSetFlip( &ghostCtx, nSurfaceFlipIndex );
  1461. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 2 );
  1462. int nOffsetEndOfFlip = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
1463. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 3 ); // transitional value; the jump below leads back to the area that resets the label to 0
  1464. *( ghostCtx.current++ ) = CELL_GCM_JUMP( nReplayLabelResetIoOffset );
  1465. __sync();
  1466. g_spuGcmShared.m_eaGcmControlRegister->put = nOffsetEndOfFlip;
  1467. Msg( "[%d.%d] flip@ %X..%X. ", nCurrentReplayFrame, nSurfaceFlipIndex, nReplayGhostAreaIoOffset, nOffsetEndOfFlip );
  1468. while( *pLabelReplay != 2 )
  1469. continue;
1470. uint32 nFrameEnd = __mftb(); Msg( "%.2f ms (replay).\n", ( nFrameEnd - nTickStart ) / 79800.0f );
  1471. while ( cellGcmGetFlipStatus() != CELL_GCM_DISPLAY_FLIP_STATUS_DONE )
  1472. {
  1473. // Wait for the previous flip to completely finish
  1474. ThreadSleep( 1 );
  1475. }
  1476. uint32 nFlipEnd = __mftb(); Msg( "%.2f ms.\n", ( nFlipEnd - nTickStart ) / 79800.0f );
  1477. pastFrame.m_eaEnd[0] = CELL_GCM_METHOD_NOP;
  1478. __sync();
  1479. nCurrentReplayFrame = ( nCurrentReplayFrame + nReplayFrames - 1 ) % nReplayFrames;
  1480. int bContinueProcessing = 0;
  1481. CellPadData padData;
  1482. do
  1483. {
  1484. int nError = cellPadGetData( 0, &padData );
  1485. if( nError )
  1486. {
  1487. Msg( "Error 0x%X trying to get pad data, aborting true pause\n", nError );
  1488. g_nTruePauseState = TRUE_PAUSE_NONE;
  1489. return false;
  1490. }
  1491. else
  1492. {
  1493. if( padData.len >= 3 )
  1494. {
  1495. int isL1Down = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL2] & CELL_PAD_CTRL_R1;
  1496. int isTriangleDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_UP;
  1497. int isCrossDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_DOWN;
  1498. int isCircleDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_RIGHT;
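// Note: these masks don't obviously match the variable names - CELL_PAD_CTRL_R1 lives in
// the DIGITAL2 word while CELL_PAD_CTRL_UP/DOWN/RIGHT are DIGITAL1 d-pad bits - so this
// appears to read R1 and the d-pad rather than the face buttons the names suggest. The
// names are kept since the state machine below only cares about the roles
// ( pause / lock / single-step ), not the physical buttons.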
1499. bContinueProcessing = isTriangleDown; // keep looping here while this button is held down
  1500. int isLockDown = isCrossDown, isSingleStepDown = isCircleDown, isPauseDown = isL1Down;
  1501. if( g_nTruePauseState != TRUE_PAUSE_SINGLE_STEP && isSingleStepDown )
  1502. {
  1503. g_nTruePauseState = TRUE_PAUSE_SINGLE_STEP;
  1504. bContinueProcessing = false; // return to render a single step
  1505. }
  1506. switch( g_nTruePauseState )
  1507. {
  1508. case TRUE_PAUSE_LOCKED1:
  1509. case TRUE_PAUSE_LOCKED0:
  1510. if( isPauseDown )
  1511. {
  1512. if( g_nTruePauseState == TRUE_PAUSE_LOCKED1 )
  1513. {
  1514. g_nTruePauseState = TRUE_PAUSE_NONE; // second press on the shoulder releases the lock
  1515. bContinueProcessing = false;
  1516. }
  1517. }
  1518. else
  1519. {
  1520. if( g_nTruePauseState == TRUE_PAUSE_LOCKED0 )
  1521. {
  1522. g_nTruePauseState = TRUE_PAUSE_LOCKED1; // promote: shoulder isn't pressed any more
  1523. }
  1524. }
  1525. break;
  1526. case TRUE_PAUSE_SPINNING:
  1527. if( isLockDown )
  1528. {
  1529. g_nTruePauseState = TRUE_PAUSE_LOCKED0;
  1530. }
  1531. else if( isSingleStepDown )
  1532. {
  1533. g_nTruePauseState = TRUE_PAUSE_SINGLE_STEP;
  1534. bContinueProcessing = false; // do the single step
  1535. }
  1536. else if( !isPauseDown )
  1537. {
  1538. if( isFirstIteration )
  1539. {
  1540. g_nTruePauseState = TRUE_PAUSE_LOCKED1; // assume we go into locked state if L1 wasn't pressed the very first frame
  1541. }
  1542. else
  1543. {
  1544. g_nTruePauseState = TRUE_PAUSE_NONE;
  1545. bContinueProcessing = false;
  1546. }
  1547. }
  1548. break;
  1549. case TRUE_PAUSE_SINGLE_STEP:
1550. // we skipped one render frame; go back to the normal spinning state as soon as the user releases the single-step button
  1551. if( !isSingleStepDown )
  1552. {
  1553. if( isPauseDown )
  1554. {
  1555. g_nTruePauseState = TRUE_PAUSE_SPINNING; // the shoulder is still down, so the user didn't decide yet if they want to let the game go
  1556. }
  1557. else
  1558. {
  1559. g_nTruePauseState = TRUE_PAUSE_LOCKED1; // we let the shoulder go, so it must be a locked state
  1560. }
  1561. }
  1562. break;
  1563. }
  1564. }
  1565. }
  1566. isFirstIteration = false;
  1567. }
  1568. while( bContinueProcessing );
  1569. }
  1570. while( true );
  1571. return false;
  1572. }
  1573. static ConVar spugcm_validatedeferredqueue( "spugcm_validatedeferredqueue", "0" );
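// ValidateDeferredQueue is a debug-only, ConVar-gated consistency walk over the deferred
// queue: it follows the chunk chain from the oldest recorded cursor to the currently open
// chunk, checking every header and next-pointer and allowing at most one wrap-around of
// the ring buffer.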
  1574. void CSpuGcm::ValidateDeferredQueue()
  1575. {
  1576. #ifdef _DEBUG
  1577. if( !spugcm_validatedeferredqueue.GetBool() )
  1578. return;
  1579. uint32 * pCmdEnd = m_pDeferredChunkHead;
  1580. if( !pCmdEnd )
  1581. pCmdEnd = m_pDeferredQueueCursors[0];
  1582. pCmdEnd = m_spuDrawQueues[1].NormalizeCursor( pCmdEnd );
  1583. Assert( m_spuDrawQueues[1].IsValidCursor( pCmdEnd ) );
  1584. uint32 * pCmdBegin = m_pDeferredQueueCursors[ARRAYSIZE(m_pDeferredQueueCursors)-1];
  1585. uint nWraps = 0;
  1586. for( uint32 * pCmd = pCmdBegin; pCmd != pCmdEnd; )
  1587. {
  1588. uint nCmd = *pCmd;
  1589. if( nCmd == 0 )
  1590. {
  1591. pCmd++;
  1592. }
  1593. else if( ( nCmd & SPUDRAWQUEUE_METHOD_MASK ) == SPUDRAWQUEUE_NOPCOUNT_METHOD )
  1594. {
  1595. pCmd += 1 + ( nCmd & SPUDRAWQUEUE_NOPCOUNT_MASK );
  1596. }
  1597. else
  1598. {
  1599. Assert( IsValidDeferredHeader( nCmd ) );
  1600. Assert( nWraps == 0 || pCmd < pCmdBegin );
  1601. Assert( m_spuDrawQueues[ 1 ].IsValidCursor( pCmd ) );
  1602. uint32 * pNext = ( uint32* )pCmd[ 1 ];
  1603. Assert( m_spuDrawQueues[ 1 ].IsValidCursor( pNext ) );
  1604. if( pNext < pCmd )
  1605. {
  1606. Assert( nWraps == 0 );
  1607. nWraps++;
  1608. }
  1609. pCmd = pNext;
  1610. }
  1611. pCmd = m_spuDrawQueues[1].NormalizeCursor( pCmd );
  1612. }
  1613. #endif
  1614. }