|
|
//========= Copyright © 1996-2004, Valve LLC, All rights reserved. ============
//
// This is the common include file to be included in SPU jobs.
// It takes care to remap/emulate some SPU-specific functionality on PPU
//
#ifndef PS3_SPU_JOB_SHARED_HDR
#define PS3_SPU_JOB_SHARED_HDR
#ifdef _PS3
#include <ps3/ps3_platform.h>
#include <cell/spurs/job_chain.h>
#include <cell/spurs/job_queue.h>
#include <cell/spurs/job_queue_port2.h>
#include <cell/dma/types.h>
//
// NOTE: Enable the following block for debugging GCM on SPU; works as of SDK 350
//
// Disabled by default (#if 0). When switched on for SPU builds, this replaces the
// GCM library's CELL_GCM_ASSERT / CELL_GCM_ASSERTS with this file's Assert / AssertSpuMsg.
#if 0 && defined( __SPU__ )
#include <cell/gcm/gcm_macros.h>
#undef CELL_GCM_ASSERT
#undef CELL_GCM_ASSERTS
#define CELL_GCM_ASSERT(condition) Assert( condition )
#define CELL_GCM_ASSERTS(condition, description) AssertSpuMsg( condition, description )
#define CELL_GCM_ASSERT_ENABLE
#endif
// DMA tag assignments shared by the jobs that include this header.
// These shouldn't overlap with the tags used by the workload.
enum DmaTagEnum_t
{
	// used for synchronous transfers, where we need the transfer to finish
	// very soon/immediately after issuing
	DMATAG_SYNC = 2,

	DMATAG_TEXTURES = 3,
	DMATAG_SHADERS = 4,

	// used for DMA PUTs from Scratch memory, so we need to wait for this to
	// finish before job finishes
	DMATAG_SCRATCH = 5,

	// each jobchain needs 2 dma tags, up to tag 30
	//	DMATAG_EDGE_JOBCHAIN = 8,
	//	DMATAG_FPCP_JOBCHAIN = 10,
	//	DMATAG_GCM_JOBCHAIN = 12,

	// non immediate dma's
	DMATAG_ANIM = 8,
	DMATAG_BUILDINDICES = 8,
	DMATAG_BUILDRENDERABLES = 8,
};
// Define DISABLE_ASSERT to compile out the asserts and debug logging declared further down
// in this header. This may be necessary to detect timing issues in DEBUG and RELEASE,
// or incorrectly generated code from the compiler.
// When USE_LSGUARD is defined, asserts are disabled automatically to force potential issues to surface.
#ifdef USE_LSGUARD
# define DISABLE_ASSERT
#endif
// Returns p moved by nBytes bytes (nBytes may be negative), independent of sizeof( T ).
// The offset is applied through a byte pointer rather than by converting the address to int,
// so the pointer is never truncated on targets where a pointer is wider than int.
template <typename T>
inline T* AddBytes( T* p, int nBytes )
{
	return ( T* )( ( char* )p + nBytes );
}
// Returns the smaller of a and b; when neither compares less, b is returned.
template <typename T>
inline T Min( T a, T b )
{
	if ( a < b )
	{
		return a;
	}
	return b;
}
// Returns the larger of a and b; when neither compares greater, b is returned.
template <typename T>
inline T Max( T a, T b )
{
	if ( a > b )
	{
		return a;
	}
	return b;
}
// Exchanges the values of a and b through a temporary copy.
template <typename T>
inline void Swap( T& a , T & b )
{
	T saved = a;
	a = b;
	b = saved;
}
// <sergiy> should I port platform.h to SPU?
#ifdef SPU
#include <cell/spurs/job_context.h>
#include "cell/spurs/common.h"
#include <cell/atomic.h>
#include <spu_intrinsics.h>
#include <vmx2spu.h>
// SPU build: PPU_ONLY( X ) discards its argument, SPU_ONLY( X ) keeps it.
#define PPU_ONLY(X)
#define SPU_ONLY(X) X
// 'vector' is spelled as the compiler keyword '__vector' in this branch
#define vector __vector
// Declarations only; they are reached through the CheckBufferOverflow() / CheckDmaGet()
// macros when asserts are enabled.
void CheckBufferOverflow_Impl(); void CheckDmaGet_Impl( const void * pBuffer, size_t nSize );
// SPU logging / debug-break helpers.
// With _CERT or DISABLE_ASSERT they expand to nothing; otherwise they print through
// spu_printf, prefixing every message with the current SPU id.
// NOTE(review): Msg() is only defined in the enabled branch - confirm that no SPU code
// relies on it in _CERT / DISABLE_ASSERT builds.
#if defined(_CERT) || defined(DISABLE_ASSERT)
# define VjobSpuLog(...)
# define DebuggerBreak()
# define Warning(...)
# define CheckBufferOverflow()
# define CheckDmaGet(p, size)
#else
# include <spu_printf.h>
# define VjobSpuLog( MSG, ... ) spu_printf( "[%d]" MSG, cellSpursGetCurrentSpuId(), ##__VA_ARGS__ )
# define Msg( MSG, ... ) spu_printf( "[%d]" MSG, cellSpursGetCurrentSpuId(), ##__VA_ARGS__ )
// DebuggerBreak is only provided here when BASETYPES_H has not been seen
#ifndef BASETYPES_H
#define DebuggerBreak() __asm volatile ("stopd $0,$0,$0")
#endif
# define Warning( MSG, ... ) spu_printf( "[%d] Warning: " MSG, cellSpursGetCurrentSpuId(), ##__VA_ARGS__ )
# define CELL_DMA_ASSERT_VERBOSE
# define CheckBufferOverflow() CheckBufferOverflow_Impl()
# define CheckDmaGet(p, size) CheckDmaGet_Impl( p, size )
#endif
// SPU build: LWSYNC_PPU_ONLY() expands to nothing.
#define LWSYNC_PPU_ONLY()
#define VJOB_IOBUFFER_DMATAG g_stInfo->dmaTag // DMA tag read from the g_stInfo job context
#include <cell/spurs/common.h>
// On SPU the VjobDma* names are plain aliases for the corresponding cellDma* SDK calls.
// (The PPU branch of this header declares functions with the same names instead.)
#define VjobDmaPut cellDmaPut
#define VjobDmaGet cellDmaGet
#define VjobDmaGetf cellDmaGetf
#define VjobDmaListGet cellDmaListGet
#define VjobDmaLargePut cellDmaLargePut
#define VjobDmaLargePutf cellDmaLargePutf
//#define VjobDmaLargePutb cellDmaLargePutb
#define VjobDmaPutf cellDmaPutf
#define VjobDmaSmallPut cellDmaSmallPut
#define VjobDmaSmallPutf cellDmaSmallPutf
//#define VjobDmaSmallPutb cellDmaSmallPutb
#define VjobDmaSmallGet cellDmaSmallGet
#define VjobWaitTagStatusAll cellDmaWaitTagStatusAll
#define VjobWaitTagStatusImmediate cellDmaWaitTagStatusImmediate
#define VjobDmaGetUint32 cellDmaGetUint32
#define VjobDmaPutUint32 cellDmaPutUint32
#define VjobDmaGetUint64 cellDmaGetUint64
#define VjobDmaPutUint64 cellDmaPutUint64
#define VjobDmaUnalignedPutf cellDmaUnalignedPutf
#define VjobDmaUnalignedPut cellDmaUnalignedPut
// Fenced (MFC_PUTF_CMD) store of one SIZE-bit value to effective address 'ea'.
// The value is splatted across a local quadword, cellDmaEa2Ls() selects the local-store
// address matching 'ea', and the transfer is issued through cellDmaAndWait().
#define VjobDmaPutfUintTemplate(SIZE, value, ea, tag, tid, rid) \
	do { \
		uint64_t __cellDma_ea = ea; \
		uint32_t __cellDma_tag = tag; \
		qword _buf = (qword)spu_splats(value); \
		cellDmaDataAssert(__cellDma_ea,sizeof(uint##SIZE##_t),__cellDma_tag); \
		cellDmaAndWait(cellDmaEa2Ls(__cellDma_ea,&_buf),__cellDma_ea,sizeof(uint##SIZE##_t),__cellDma_tag,MFC_CMD_WORD(tid,rid,MFC_PUTF_CMD)); \
	} while(0)
// The sized wrappers go through the fenced template above; they previously expanded to the
// SDK's cellDmaPutUintTemplate, which left VjobDmaPutfUintTemplate unused.
// 'value' is parenthesized so that the cast applies to the whole argument expression.
#define VjobDmaPutfUint8(value, ea, tag) VjobDmaPutfUintTemplate(8, ((uint8_t)(value)), ea, tag, 0, 0)
#define VjobDmaPutfUint16(value, ea, tag) VjobDmaPutfUintTemplate(16, ((uint16_t)(value)), ea, tag, 0, 0)
#define VjobDmaPutfUint32(value, ea, tag) VjobDmaPutfUintTemplate(32, ((uint32_t)(value)), ea, tag, 0, 0)
#define VjobDmaPutfUint64(value, ea, tag) VjobDmaPutfUintTemplate(64, ((uint64_t)(value)), ea, tag, 0, 0)
// Current SPU id as an int (the PPU branch defines VjobSpuId() as -1)
#define VjobSpuId() int( cellSpursGetCurrentSpuId() )
// memset / memcpy map straight onto the compiler builtins on SPU
#define V_memset __builtin_memset
#define V_memcpy __builtin_memcpy
// Element count of a statically sized array; only defined if nobody else provided it
#if !defined ARRAYSIZE
#define ARRAYSIZE( ARRAY ) ( sizeof( ARRAY ) / sizeof( ( ARRAY )[0] ) )
#endif
// Fixed-size integer aliases for the SPU build.

// 8-bit
typedef signed char			int8;
typedef unsigned char		uint8;

// 16-bit
typedef signed short		int16;
typedef unsigned short		uint16;

// 32-bit
typedef signed int			int32;
typedef unsigned int		uint32;
typedef unsigned int		uint;

// 64-bit
typedef signed long long	int64;
typedef unsigned long long	uint64;

// unsigned integer used to carry pointer values (an unsigned int in this branch)
typedef unsigned int		uintp;
// Vector-of-float type ('vector' is #defined to '__vector' earlier in this SPU branch)
typedef vector float fltx4 ;
// Largest value of a 32-bit signed int.
// Guarded so that a definition already supplied by <limits.h> / <climits> is kept
// instead of being redefined with a different token sequence.
#ifndef INT_MAX
#define INT_MAX 0x7fffffff
#endif
// Alignment attribute helper: DECL_ALIGN( x ) expands to the GCC 'aligned( x )' attribute
#define DECL_ALIGN(x) __attribute__( ( aligned( x ) ) )
// The definitions below are skipped when BASETYPES_H has already been seen
#ifndef BASETYPES_H
// ALIGNxx goes where the attribute is needed; the _POST forms expand to nothing here
#define ALIGN16 DECL_ALIGN(16)
#define ALIGN16_POST
#define ALIGN128 DECL_ALIGN(128)
#define ALIGN128_POST
template <typename T> inline T AlignValue( T val, uintp alignment ) { return ( T )( ( ( uintp )val + alignment - 1 ) & ~( alignment - 1 ) ); }
// Macro form of AlignValue: rounds val up to a multiple of alignment (mask arithmetic).
// NOTE: the arguments are not parenthesized inside the expansion, so pass simple
// expressions only (e.g. a variable or a literal).
#define ALIGN_VALUE( val, alignment ) ( ( val + alignment - 1 ) & ~( alignment - 1 ) )
// True when x has at most one bit set. Note that this also reports true for x == 0.
inline bool IsPowerOfTwo( uint x )
{
	const uint nWithoutLowestSetBit = x & ( x - 1 );
	return nWithoutLowestSetBit == 0;
}
#endif
// FORCEINLINE is plain 'inline' here; the always_inline attribute is left commented out
#define FORCEINLINE inline /* __attribute__ ((always_inline)) */
// Compile-time platform predicates for the SPU build
#define IsPlatformPS3() 1
#define IsPlatformPS3_PPU() 0
#define IsPlatformPS3_SPU() 1
#define IsPlatformX360() 0
#define IsPlatformOSX() 0
// RESTRICT expands to nothing unless it has been defined already
#if !defined RESTRICT
#define RESTRICT
#endif
// (same definitions as earlier in this SPU branch)
#define V_memset __builtin_memset
#define V_memcpy __builtin_memcpy
// SPU build: intentionally does nothing. (The PPU branch of this header declares an
// external function with the same name instead.)
inline void VjobPpuRereadEA( uintp ea )
{
	( void )ea;
}
// SPU assertion macros.
// With _CERT or DISABLE_ASSERT: Assert / AssertSpuMsg evaluate nothing and
// COMPILE_TIME_ASSERT expands to nothing.
// Otherwise: a failed Assert / AssertSpuMsg prints the stringized expression (plus MSG)
// with the current SPU id through spu_printf and then calls DebuggerBreak().
#if defined(_CERT) || defined(DISABLE_ASSERT)
#define Assert(x) ((void)(0))
#define AssertSpuMsg(x,MSG,...)((void)0)
#ifndef DBG_H
#define COMPILE_TIME_ASSERT( pred ) // to avoid any unpredictable effects in the optimizer
#endif
#else
#define DBGFLAG_ASSERT
// Assert and COMPILE_TIME_ASSERT are only provided here when DBG_H has not been seen
#ifndef DBG_H
#define Assert(x) do{if( !( x ) ) { spu_printf( "Assert on SPU[%d](" #x ")\n", cellSpursGetCurrentSpuId() ); DebuggerBreak(); } }while(0)
#endif
#define AssertSpuMsg(x,MSG,...) do{if( !( x ) ) { spu_printf( "Assert on SPU[%d](" #x "), " MSG, cellSpursGetCurrentSpuId(), ## __VA_ARGS__ ); DebuggerBreak(); } }while(0)
#ifndef DBG_H
// a false pred produces a duplicate 'case 0' label, which fails to compile
#define COMPILE_TIME_ASSERT( pred ) switch(0){case 0:case pred:;}
#endif
#endif
// mimic the PPU class on SPU
// template< int bytesAlignment, class T >
// class CAlignedNewDelete : public T
// {public:
// }
// WARNING: SLOWNESS. DO NOT USE IN PRODUCTION.
// Copies nSize bytes from effective address eaSrc to eaDest by bouncing every chunk
// through lsScratch: each chunk is fetched, waited for, written back and waited for again.
// Chunks are at most 16 KB, so lsScratch has to provide that much room.
// Both addresses and the size are asserted to be multiples of 16.
inline void DebugMemcpyEa( uint eaDest, uint eaSrc, uint nSize, void *lsScratch )
{
	Assert( ! ( 0xF & ( eaSrc | eaDest | nSize ) ) );

	const uint nMaxChunk = 16 * 1024;
	for( uint nDone = 0; nDone < nSize; )
	{
		const uint nChunk = Min<uint>( nMaxChunk, nSize - nDone );

		VjobDmaGet( lsScratch, eaSrc + nDone, nChunk, DMATAG_SYNC, 0, 0 );
		VjobWaitTagStatusAll( 1 << DMATAG_SYNC );

		VjobDmaPut( lsScratch, eaDest + nDone, nChunk, DMATAG_SYNC, 0, 0 );
		VjobWaitTagStatusAll( 1 << DMATAG_SYNC );

		nDone += nChunk;
	}
}
// Extracts a 32-bit unsigned scalar from a vector value via si_to_uint
#define vec_to_uint32(X) si_to_uint( ( qword )( X ) )
// SPU forms of the job-queue port calls: the port and job are passed as integer (uintp)
// addresses, and the DMA tag is forwarded.
#define VjobQueuePort2PushJob( eaPort, eaJob, sizeDesc, tag, dmaTag, flag ) cellSpursJobQueuePort2PushJob( (uintp)( eaPort ), (uintp)( eaJob ), ( sizeDesc ), ( tag ), ( dmaTag ), ( flag ) )
#define VjobQueuePort2PushSync( eaPort2, tagMask, dmaTag, flag ) cellSpursJobQueuePort2PushSync( ( uintp ) ( eaPort2), ( tagMask ), ( dmaTag ), ( flag ) )
// Pushes a job onto the given job-queue port, retrying the non-blocking push for as long
// as it reports CELL_SPURS_JOB_ERROR_AGAIN. Any other failure is logged and breaks into
// the debugger.
inline void VjobQueuePort2PushJobBlocking( CellSpursJobQueuePort2 *eaPort2, CellSpursJobHeader *eaJob, size_t sizeDesc, uint nQueueTag, uint nDmaTag )
{
	int nResult;
	do
	{
		nResult = cellSpursJobQueuePort2PushJob( uintp( eaPort2 ), uintp( eaJob ) , sizeDesc, nQueueTag, nDmaTag, CELL_SPURS_JOBQUEUE_FLAG_NON_BLOCKING );
	}
	while( nResult == CELL_SPURS_JOB_ERROR_AGAIN );

	if( nResult == CELL_OK )
	{
		return;
	}

	VjobSpuLog( "Cannot push job, error %d. RSX is going to hang, then SPUs, then PPU.\n", nResult );
	DebuggerBreak();
}
// Pushes a sync command for tagMask onto the given job-queue port, retrying the
// non-blocking push for as long as it reports CELL_SPURS_JOB_ERROR_AGAIN.
// Any other failure is logged and breaks into the debugger.
inline void VjobQueuePort2PushSyncBlocking( CellSpursJobQueuePort2 *eaPort2, unsigned tagMask, uint nDmaTag )
{
	int nError;
	for(;;)
	{
		nError = cellSpursJobQueuePort2PushSync( uintp( eaPort2 ), tagMask, nDmaTag, CELL_SPURS_JOBQUEUE_FLAG_NON_BLOCKING );
		if( nError != CELL_SPURS_JOB_ERROR_AGAIN )
		{
			break;
		}
	}

	if ( nError != CELL_OK )
	{
		// the message names the sync push, so it can be told apart from a failed job push
		VjobSpuLog( "Cannot push sync, error %d. RSX is going to hang, then SPUs, then PPU.\n", nError );
		DebuggerBreak();
	}
}
#else
#include "tier0/platform.h"
#include "tier1/strtools.h"
#include "mathlib/ssemath.h"
#include <altivec.h>
#include <cell/spurs/job_context_types.h>
// PPU build: there is no SPU to identify, so an all-ones value is returned.
inline uint32_t GetCurrentSpuId()
{
	return 0xFFFFFFFF;
}

using namespace ::cell::Spurs;

// printf-style logging; declared here, implemented elsewhere
extern void VjobSpuLog( const char * p, ... );
#define VJOB_IOBUFFER_DMATAG 0 // fake DMA tag
// PPU build: PPU_ONLY( X ) keeps its argument, SPU_ONLY( X ) discards it.
#define PPU_ONLY(X) X
#define SPU_ONLY(X)
// AssertSpuMsg on PPU: in _DEBUG a failure prints the stringized expression plus MSG
// through Warning() and then breaks; otherwise the macro expands to nothing.
#ifdef _DEBUG
#define AssertSpuMsg(x,MSG,...) do { if( !( x ) ) { Warning( "Assert(" #x "), " MSG, ## __VA_ARGS__ ); DebuggerBreak(); } }while( 0 )
#else
#define AssertSpuMsg(x,MSG,...)
#endif
// PPU forms of the job-queue port calls: same macro parameters as the SPU forms, but the
// port / job are cast to pointers and dmaTag is not forwarded.
#define VjobQueuePort2PushJob( eaPort, eaJob, sizeDesc, tag, dmaTag, flag ) cellSpursJobQueuePort2PushJob( (CellSpursJobQueuePort2 *)( eaPort ), (CellSpursJobHeader *)( eaJob ), ( sizeDesc ), ( tag ), ( flag ) )
#define VjobQueuePort2PushSync( eaPort2, tagMask, dmaTag, flag ) cellSpursJobQueuePort2PushSync( (CellSpursJobQueuePort2 *) ( eaPort2), ( tagMask ), ( flag ) )
// PPU version: pushes the job with flags == 0 (synchronous call) and asserts that the
// push succeeded. nDmaTag is accepted for signature parity with the SPU version and is unused.
inline void VjobQueuePort2PushJobBlocking( CellSpursJobQueuePort2 *eaPort2, CellSpursJobHeader *eaJob, size_t sizeDesc, uint nQueueTag, uint nDmaTag )
{
	int nError;
	nError = cellSpursJobQueuePort2PushJob( eaPort2, eaJob, sizeDesc, nQueueTag, 0 );

	Assert( nError == CELL_OK );
	( void )nError; // keeps builds without asserts free of unused-variable warnings
}
// PPU version: pushes a sync for tagMask with flags == 0 (synchronous call) and asserts
// that the push succeeded. nDmaTag is accepted for signature parity with the SPU version
// and is unused.
inline void VjobQueuePort2PushSyncBlocking( CellSpursJobQueuePort2 *eaPort2, unsigned tagMask, uint nDmaTag )
{
	int nError;
	nError = cellSpursJobQueuePort2PushSync( eaPort2, tagMask, 0 );

	Assert( nError == CELL_OK );
	( void )nError; // keeps builds without asserts free of unused-variable warnings
}
// PPU build: there is no SPU id
#define VjobSpuId() -1
#define LWSYNC_PPU_ONLY() __lwsync()
// PPU-side stand-ins for the SPU DMA calls. They are only declared here (implemented
// elsewhere) and use the same parameter lists as the cellDma* calls the SPU branch
// aliases: local-store pointer, effective address, size, tag, tid, rid.
extern void VjobDmaPut( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaGet( void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaGetf( void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaListGet( void *ls, uint64_t ea, const CellDmaListElement *list, uint32_t listSize, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaLargePut( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaLargePutf( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaLargePutb( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaPutf( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallPut( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallGet( void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallPutb( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallPutf( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
// NOTE: implementation must wait for tag
uint32_t VjobDmaGetUint32( uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaPutUint32( uint32_t value, uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
uint64_t VjobDmaGetUint64( uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaPutUint64( uint64_t value, uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaUnalignedPutf( const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaUnalignedPut( const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
// Deferred DMA is not emulated on PPU, so there is never anything to wait for:
// both "wait" calls simply hand the requested tag mask straight back.
inline uint VjobWaitTagStatusAll( uint nTagMask )
{
	return nTagMask;
}

inline uint VjobWaitTagStatusImmediate( uint nTagMask )
{
	return nTagMask;
}
// PPU emulation of the fenced single-value puts: a direct store through 'ea'; 'tag' is ignored.
// Both arguments and the whole expansion are parenthesized so that expression arguments
// (e.g. an address written as 'p + 1') are cast and dereferenced as a unit.
#define VjobDmaPutfUint8(value, ea, tag) ( *( uint8* )( ea ) = ( uint8 )( value ) )
#define VjobDmaPutfUint16(value, ea, tag) ( *( uint16* )( ea ) = ( uint16 )( value ) )
#define VjobDmaPutfUint32(value, ea, tag) ( *( uint32* )( ea ) = ( uint32 )( value ) )
#define VjobDmaPutfUint64(value, ea, tag) ( *( uint64* )( ea ) = ( uint64 )( value ) )
// PPU-side entry points, declared here and implemented elsewhere:
//  - VjobPushJob takes a job main function pointer and a job descriptor
//  - VjobSpuLog is the printf-style logger
//  - VjobPpuRereadEA takes an effective address (the SPU branch defines it as a no-op)
void VjobPushJob( void ( *pfnMain )( CellSpursJobContext2 * stInfo, CellSpursJob256 * job ), CellSpursJob128 * job ); extern void VjobSpuLog( const char * p, ... ); extern void VjobPpuRereadEA( uintp ea );
// PPU version: effective addresses are directly addressable, so this is a plain memcpy.
// Both addresses and the size are asserted to be multiples of 16; lsScratch is unused here.
inline void DebugMemcpyEa( uint eaDest, uint eaSrc, uint nSize, void *lsScratch )
{
	Assert( ! ( 0xF & ( eaSrc | eaDest | nSize ) ) );

	void *pTarget = ( void* )eaDest;
	void *pSource = ( void* )eaSrc;
	memcpy( pTarget, pSource, nSize );
}
extern void TestAlignBuffer();
// PPU: reads the first 32 bits of X in place
#define vec_to_uint32(X) (*(uint32*)&(X))
#endif // SPU
// Local-store address that shares its low 4 (resp. 7) address bits with 'ea':
// 'ls' plus the offset of 'ea' within its 16-byte (resp. 128-byte) line.
#define VjobDmaEa2Ls16(ea, ls) ((uintptr_t)(ls)+((uint32_t)(ea)&15))
#define VjobDmaEa2Ls128(ea, ls) ((uintptr_t)(ls)+((uint32_t)(ea)&127))
// Selects the 32-bit slot inside the quadword at lsAligned whose offset matches the low
// 4 address bits of eaUnaligned, stores nInitialValue there and returns the slot's address.
inline uint32* PrepareSmallPut32( vector unsigned int * lsAligned, volatile uint32 * eaUnaligned, uint32 nInitialValue )
{
	Assert( !( 3 & uint( lsAligned ) ) );

	uint32 * const pSlot = ( uint32* )VjobDmaEa2Ls16( eaUnaligned, lsAligned );
	pSlot[0] = nInitialValue;
	return pSlot;
}
// Selects the 64-bit slot inside the quadword at lsAligned whose offset matches the low
// 4 address bits of eaUnaligned, stores nInitialValue there and returns the slot's address.
inline uint64* PrepareSmallPut64( vector unsigned int * lsAligned, volatile uint64 * eaUnaligned, uint64 nInitialValue )
{
	Assert( !( 7 & uint( lsAligned ) ) );

	uint64 * const pSlot = ( uint64* )VjobDmaEa2Ls16( eaUnaligned, lsAligned );
	pSlot[0] = nInitialValue;
	return pSlot;
}
// Job context pointer; on SPU, VJOB_IOBUFFER_DMATAG reads its dmaTag member.
extern CellSpursJobContext2* g_stInfo;
// IsDebug(): true in _DEBUG builds, false otherwise (only defined if not provided already)
#ifndef IsDebug
# ifdef _DEBUG
# define IsDebug() true
# else
# define IsDebug() false
# endif
#endif
// IsCert(): true in _CERT builds, false otherwise (only defined if not provided already)
#ifndef IsCert
# ifdef _CERT
# define IsCert() true
# else
# define IsCert() false
# endif
#endif
// Bit mask selecting which BreakOn() ids break into the debugger.
extern uint g_nBreakMask ;
// BreakOn( nId ): calls DebuggerBreak() when bit nId of g_nBreakMask is set.
// Compiles to nothing in _CERT builds.
// nId is parenthesized and shifted as unsigned, so expression arguments bind correctly
// and id 31 does not shift into the sign bit.
#ifdef _CERT
# define BreakOn( nId )
#else
# define BreakOn( nId ) do \
	{ \
		if( g_nBreakMask & ( 1u << ( nId ) ) ) \
			DebuggerBreak(); \
	}while( 0 )
#endif
// Debug aid that stalls the caller for a duration derived from nCycles.
// Does nothing when IsCert() is true. On SPU it spins on the decrementer; on PPU it sleeps.
inline void VjobDebugSpinCycles( uint nCycles )
{
	if( IsCert() )
	{
		return;
	}

#ifdef SPU
	// spin until the decrementer reading has moved nCycles / 40 ticks away from the start value
	uint nStart = spu_read_decrementer();
	while( nStart - spu_read_decrementer() < nCycles / 40 )
	{
	}
#else
	sys_timer_usleep( nCycles / 3200 );
	/*
	alternative busy-wait on the time base, left disabled:
	uint nStart = __mftb(); while( __mftb() - nStart < nCycles / 40 ) continue;
	*/
#endif
}
// DMA list element without the notify or reserved fields. That makes it easy to fill in
// while being sure no garbage is left behind (in the notify and reserved fields), and it
// avoids bit-field operations when storing the size (which is effectively only a 14-bit value).
struct BasicDmaListElement_t
{
	uint32 size;
	uint32 eal;
};
// Shifts the unaligned pBuffer of nBytes bytes left by 0..15 bytes to make it 16-byte aligned.
// Returns the aligned pointer, i.e. pBuffer & -16.
// (Declared here; implemented elsewhere.)
extern void* AlignBuffer( void * pBuffer, uint nBytes);
//
// Adds the constant nAdd to each of the nCount uint16 values in the given unaligned buffer
// (declared here; implemented elsewhere)
//
extern void UnalignedBufferAddU16( uint16 * pBuffer, uint nCount, uint16 nAdd );
// SpursJob_t must be one of CellSpursJob64, CellSpursJob128, CellSpursJob256,...
// JobParam_t is the parameter structure passed to the job
template < typename JobParam_t , typename SpursJob_t > inline JobParam_t * VjobGetJobParams( void * pJob ) { Assert( sizeof( JobParam_t ) + sizeof( CellSpursJobHeader ) <= sizeof( SpursJob_t ) ); JobParam_t * pJobParams = ( JobParam_t* ) ( uintp( pJob ) + ( sizeof( SpursJob_t ) - sizeof( JobParam_t ) ) ); Assert( uintp( pJobParams + 1 ) == uintp( pJob ) + sizeof( SpursJob_t ) ); return pJobParams; }
// NOTE(review): parameterless overload of UnalignedBufferAddU16 - looks like a leftover
// declaration; confirm whether anything defines or calls it before removing.
extern void UnalignedBufferAddU16( );
// Compile-time base-2 logarithm: Log2<n>::VALUE for any power of two n >= 1.
// (Previously only 8, 16, 32 and 256 were available.)
// Using it with zero or with a value that is not a power of two fails to compile.
template <unsigned int n>
struct Log2
{
	// negative array size => compile error unless n is a power of two
	typedef char MustBePowerOfTwo[ ( n & ( n - 1 ) ) == 0 ? 1 : -1 ];
	enum { VALUE = 1 + Log2< ( n >> 1 ) >::VALUE };
};
// recursion anchor
template<> struct Log2<1> { enum { VALUE = 0 }; };
// log2( 0 ) is undefined: no VALUE member, so any use is a compile error
template<> struct Log2<0> {};
#define COMPILE_TIME_LOG2(VAL) ( Log2<VAL>::VALUE )
inline void ZeroMemAligned( void * p, uint nSize ) { Assert( !( ( uintp( p ) | nSize ) & 15 ) ); for( uint i = 0; i < nSize; i += 16 ) { *( vec_uint4* )( uintp( p ) + i ) = (vec_uint4){0,0,0,0}; } }
inline void CopyMemAligned( void * pDst, const void * pSrc, uint nSize ) { Assert( !( ( uintp( pDst ) | uintp( pSrc ) | nSize ) & 15 ) ); for( uint i = 0; i < nSize; i += 16 ) { *( vec_uint4* )( uintp( pDst ) + i ) = *( vec_uint4* )( uintp( pSrc ) + i ); } }
///////////////////////////////////////////////////////////////////////////
//
// Reference implementation
//
// Fixed-size array of nBitCount bits. Bits are numbered MSB-first inside each 32-bit
// word: bit n lives in m_u32[ n >> 5 ] under the mask 0x80000000 >> ( n & 31 ).
//
template <uint nBitCount>
class CBitArray
{
public:
	// Clears every bit, including the last quadword when nBitCount is not a multiple of 128.
	void Clear()
	{
		for( uint i = 0; i < ( ( nBitCount + 127 ) >> 7 ); ++i )
		{
			m_qword[i] = ( vec_uint4 ){0,0,0,0};
		}
		//m_nSetCount = 0;
	}

	// Sets the bits in [nStart, nEnd). nEnd is clamped to nBitCount; nothing happens when
	// nStart is past the (clamped) end or the range is empty.
	void SetRange( uint nStart, uint nEnd )
	{
		nEnd = Min( nEnd, nBitCount );
		if( nStart > nEnd )
			return;
		//m_nSetCount = Max( nEnd, m_nSetCount );

		const uint nLastWord = nEnd >> 5;

		// first word: everything from nStart's bit to the end of the word
		uint nMask = uint( -1 ) >> ( nStart & 0x1F );
		for( uint i = ( nStart >> 5 ); i < nLastWord; ++i )
		{
			m_u32[i] |= nMask;
			nMask = uint( -1 ); // all following whole words are set completely
		}

		// last word: keep only the bits in front of nEnd
		nMask &= ~( uint( -1 ) >> ( nEnd & 0x1F ) );
		if( nMask )
		{
			// skipped when no bits remain, because nLastWord can then be one past
			// the end of m_u32 (nEnd on a word boundary at the end of the array)
			m_u32[ nLastWord ] |= nMask;
		}
	}

	//uint GetSetCount()const{return m_nSetCount;}

	// Index of the first set bit at or after nFrom, or nBitCount if there is none.
	uint GetFirst1( uint nFrom )const
	{
		for( uint i = nFrom; i < nBitCount; ++i )
			if( GetBit( i ) )
				return i;
		return nBitCount;
	}

	// Index of the first clear bit at or after nFrom, or nBitCount if there is none.
	uint GetFirst0( uint nFrom )const
	{
		for( uint i = nFrom; i < nBitCount; ++i )
			if( !GetBit( i ) )
				return i;
		return nBitCount;
	}

	// Non-zero when bit n is set. The result is the masked word, not normalized to 0/1.
	uint GetBit( uint n )const
	{
		return m_u32[ n >> 5 ] & ( 0x80000000 >> ( n & 0x1F ) );
	}

protected:
	// the same storage viewed as quadwords (for clearing) and as 32-bit words (for bit access)
	union
	{
		vec_uint4 m_qword[ ( nBitCount + 127 ) / 128 ];
		uint32 m_u32[ ( nBitCount + 31 ) / 32 ];
	};
	//uint m_nSetCount;
};
#endif // _PS3
#endif
|