csgo/cstrike15_src/common/ps3/vjobchain.h

//========== Copyright � Valve Corporation, All rights reserved. ========
#if !defined( VJOBS_CHAINUTILS_HDR ) && defined( _PS3 )
#define VJOBS_CHAINUTILS_HDR

#include "tier0/platform.h"
#include <cell/spurs.h>
#include "ps3/job_notify.h"
struct VJobsRoot;

//
// The chain consists of blocks of commands; the head block has SYNC-JOB(notify)-GUARD sequence
// initially the chain waits on the GUARD , and is ready to "run". "Run" means releasing the guard.
// I'm using GUARD instead of JTS because patching JTS will render it unpatcheable until jobchain execution completes;
// and if it's unpatcheable, it effectively can't be used as a guard for the next cycle 
// Each block refers to the next one by inserting NEXT in the very last slot
// the last block is pointed to by m_pLastBlock, which is NULL initially (before entering the "run" state) 
// 
//
struct ALIGN128 VjobChain
{
public:
	cell::Spurs::JobChain m_spursJobChain;
	cell::Spurs::JobGuard m_guard;
	CellSpursJob64 m_jobNotify;
	job_notify::NotifyArea_t m_notifyArea;
	
	enum { BLOCK_COMMANDS = 256 };
	uint64 m_headBlock[BLOCK_COMMANDS+1]; // in one variant, the first entry in this list is used for END command; overwrite it with NOP to release the list
	uint64 * m_pLastBlock;
	uint m_nCurrentBlockCommands;
	uint m_nSpinWaitNotify;
	
	char m_name[16];
public:
	int Init( VJobsRoot * pRoot, uint nMaxContention, const char* pFormatName, ... );
	bool IsRunning()const { return m_pLastBlock != NULL; }
	
	int Run();
	void Push( uint64 nCommand );
	void Push( const uint64 * nCommands, uint nCommandCount );
	int End( );
	int Join();
	void Shutdown();
	
	JobChain & Jobchain() { return m_spursJobChain; }
	
}ALIGN128_POST;


// VjobChain2 hosts 2 jobchains and double-buffers between them
class VjobChain2
{
public:
	int Init( VJobsRoot * pRoot, uint nMaxContention, const char* pName );
	void Begin();															 
	void End();
	void Shutdown();

	VjobChain& Jobchain(){ return m_vjobChainRing[m_nCurrentChain]; }

protected:
	enum{VJOB_CHAINS = 2};
	VjobChain *m_vjobChainRing; // may be more than double-buffered if necessary
	uint m_nCurrentChain;
};


//#define VJOBCHAIN3_GUARD

struct ALIGN128 VjobBufferHeader_t
{
public:
	#ifdef VJOBCHAIN3_GUARD
	cell::Spurs::JobGuard m_guard;
	#endif
	CellSpursJob64 m_jobNotify;
	job_notify::NotifyArea_t m_notifyArea;
	#ifdef _DEBUG
	CellSpursJob64 m_jobNotify2;
	job_notify::NotifyArea_t m_notifyArea2;
	#endif
}
ALIGN128_POST;

struct VjobBuffer_t: public VjobBufferHeader_t
{
public:
	enum ConstEnum_t
	{
		VERBATIM_COMMAND_COUNT = 2 // we employ syncronization scheme: SYNC, JOB(notify),  ... 
	#ifdef VJOBCHAIN3_GUARD // we add GUARD, ...
		+ 1
	#endif
	#ifdef _DEBUG
		+ 1 // we add JOB(notify2), ... 
	#else
		
	#endif
	};
	uint64 m_spursCommands[16]; // there will be at least verbatim commands, a user command, and a NEXT

	void Init( VJobsRoot * pRoot, cell::Spurs::JobChain * pSpursJobChain );
};

// VjobChain3 has only 1 jobchain but double-buffers it to facilitate continuous wait-free execution
class VjobChain3
{
protected:
	enum ConstEnum_t {
		BUFFER_COUNT = 4
	};
	cell::Spurs::JobChain *m_pSpursJobChain;
	VjobBuffer_t * m_pBuffers[BUFFER_COUNT];
	VjobBuffer_t * m_pFrontBuffer;
	uint m_nFrontBuffer; // the buffer currently in use
	
	uint m_nMaxCommandsPerBuffer; // max count of commands fitting into one buffer
	uint m_nFrontBufferCommandCount; // count of commands in the current front buffer
	uint m_nSpinWaitNotify; // did we spin waiting for job_notify ? if we did, we probably need to increase the command buffer size
	uint64 m_nLastCommandPushed; // at the beginning of the scene, it's considered to be synced up

	
	const char * m_pName;

public:

	int Init( VJobsRoot * pRoot, uint nMaxContention, uint nMinCommandsPerBuffer, uint8_t nVjobChainPriority[8], const char* pName, uint nDmaTags );
	uint64* Push( uint64 nCommand );
	uint64* PushSyncJobSync( uint64 nCommand );
	void PushSync();
	void Shutdown(){End();Join();}
	void End();
	void Join();

protected:
	void WaitForEntryNotify( VjobBuffer_t * pBuffer );
	uint64* StartCommandBuffer( uint nNext1Buffer, uint64 nInsertCommand );
	uint64* SwapCommandBuffer( uint64 nInsertCommand );
};


inline uint64* VjobChain3::Push( uint64 nCommand )
{
	uint64 * pInsertionPoint;
	if( m_nFrontBufferCommandCount == m_nMaxCommandsPerBuffer - 1 )
	{
		// time to switch the buffer
		pInsertionPoint = SwapCommandBuffer( nCommand );
	}
	else
	{
		m_pFrontBuffer->m_spursCommands[ m_nFrontBufferCommandCount + 1 ] = CELL_SPURS_JOB_COMMAND_JTS;
		__lwsync();	// Important: this sync ensures that both the command header AND JTS are written before SPU sees them
		pInsertionPoint = &m_pFrontBuffer->m_spursCommands[ m_nFrontBufferCommandCount ];
		*pInsertionPoint = nCommand;
		m_nFrontBufferCommandCount ++;
	}
	m_nLastCommandPushed = nCommand;
	return pInsertionPoint;
}


inline void VjobChain3::PushSync()
{
	Push( CELL_SPURS_JOB_COMMAND_LWSYNC );
}


inline uint64* VjobChain3::PushSyncJobSync( uint64 nCommand )
{
	if( m_nLastCommandPushed != CELL_SPURS_JOB_COMMAND_LWSYNC )
	{
		// we need to wait for previous jobs to finish in order to patch the state efficiently
		// todo: double-buffer the states to avoid stalls, but only if we become SPU-bound here (un
		Push( CELL_SPURS_JOB_COMMAND_LWSYNC );
	}
	uint64 * pInsertionPoint = Push( nCommand );

	// this is instead of stalling successor because I'm not sure if it stalls all logical successors (some of which may be picked up by other SPUs)
	// the SYNC here will ensure completion of the previous job before the new jobs will be pushed
	Push( CELL_SPURS_JOB_COMMAND_LWSYNC );
	
	return pInsertionPoint;
}


#endif