Counter-Strike: Global Offensive Source Code

  1. //========== Copyright © Valve Corporation, All rights reserved. ========
  2. // This is the central hub for controlling SPU activities relating to
  3. // RSX/graphics processing/rendering
  4. //
  5. #include "spugcm.h"
  6. #include "ps3/ps3gcmmemory.h"
  7. #include "fpcpatcher_spu.h"
  8. #include "ps3gcmstate.h"
  9. #include "vjobs/root.h"
  10. #include "ps3/ps3gcmlabels.h"
  11. #include "ps3/vjobutils_shared.h"
  12. #include "vjobs/jobparams_shared.h"
  13. #include "vjobs/ibmarkup_shared.h"
  14. #include "inputsystem/iinputsystem.h"
  15. #include <sysutil/sysutil_common.h>
  16. #include <sysutil/sysutil_sysparam.h>
  17. #include <cell/pad.h>
  18. #include <materialsystem/imaterialsystem.h>
  19. #include "fpcpatcher_spu.h"
  20. #include "dxabstract.h"
  21. #include "rsxflip.h"
  22. extern IVJobs * g_pVJobs;
  23. CSpuGcmSharedState g_spuGcmShared;
  24. CSpuGcm g_spuGcm;
  25. static int s_nFinishLabelValue = 0, s_nStopAtFinishLabelValue = -1;
  26. CEdgeGeomRing g_edgeGeomRing;
  27. ApplicationInstantCountersInfo_t g_aici;
  28. CEdgePostWorkload g_edgePostWorkload;
  29. #define PCB_RING_CTX ( *gCellGcmCurrentContext )
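// PCB_RING_CTX aliases the current libgcm context ( *gCellGcmCurrentContext ). OnGcmInit() below
// repoints that context's begin/current/end and its reserve callback, so command reservations are
// redirected through SpuGcmCommandBufferReserveCallback into the PPU command-buffer (PCB) ring
// rather than the default libgcm command buffer.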
  30. void FillNops( struct CellGcmContextData *context )
  31. {
  32. while( context->current < context->end )
  33. *( context->current++ ) = CELL_GCM_METHOD_NOP;
  34. }
  35. int32_t SpuGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nCount )
  36. {
  37. return g_spuGcm.OnGcmCommandBufferReserveCallback( context, nCount );
  38. }
  39. void SpuGcmDebugFinish( CellGcmContextData *thisContext )
  40. {
  41. Assert( thisContext == &PCB_RING_CTX );
  42. g_spuGcm.CmdBufferFinish();
  43. }
  44. void StallAndWarning( const char * pWarning )
  45. {
  46. sys_timer_usleep( 30 );
  47. if( g_spuGcmShared.m_enableStallWarnings )
  48. {
  49. Warning( "Stall: %s\n", pWarning );
  50. }
  51. }
  52. //#endif
  53. void CSpuGcm::CreateRsxBuffers()
  54. {
  55. //////////////////////////////////////////////////////////////////////////
  56. // Create Fragment program patch buffers
  57. //
  58. uint nFpcpRingBufferSize = CalculateMemorySizeFromCmdLineParam( "-fpcpRingSize", 512 * 1024, 32 * 1024 );
  59. Msg("Fpcp ring size: %d bytes \n", nFpcpRingBufferSize );
  60. m_fpcpRingBuffer.Alloc( kAllocPs3GcmShader, nFpcpRingBufferSize );
  61. g_spuGcmShared.m_fpcpRing.SetRsxBuffer( m_fpcpRingBuffer.DataInLocalMemory(), nFpcpRingBufferSize, nFpcpRingBufferSize / 4, nFpcpRingBufferSize / 4096 );
  62. uint nEdgeRingBufferSize = CalculateMemorySizeFromCmdLineParam( "-edgeRingSize", 2 * 1024 * 1024, 1536 * 1024 );
  63. Msg("Edge ring size: %d bytes\n", nEdgeRingBufferSize );
  64. m_edgeGeomRingBuffer.Alloc( kAllocPs3GcmEdgeGeomBuffer, nEdgeRingBufferSize );
  65. if( nEdgeRingBufferSize < 8 * EDGEGEOMRING_MAX_ALLOCATION )
  66. {
  67. Error( "EdgeGeom has ring buffer that won't fit 8 jobs, which is a minimum. %u ( %u ) < 8 * %u\n", nEdgeRingBufferSize, m_edgeGeomRingBuffer.Size(), EDGEGEOMRING_MAX_ALLOCATION );
  68. }
  69. if( nEdgeRingBufferSize < 6 * 8 * EDGEGEOMRING_MAX_ALLOCATION )
  70. {
  71. Warning( "EdgeGeom has ring buffer that may block job_edgegeom performance. %u ( %u ) < 6 SPUs * 8 segments * %u\n", nEdgeRingBufferSize, m_edgeGeomRingBuffer.Size(), EDGEGEOMRING_MAX_ALLOCATION );
  72. }
  73. }
  74. const vec_uint4 g_vuSpuGcmCookie = (vec_uint4){0x04291978,0xC00CC1EE,0x04291978,0xC00CC1EE};
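// Note: this cookie is written just past the MLAA output buffer in CreateIoBuffers() below;
// it presumably acts as a guard value for detecting overruns of that buffer.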
  75. void CSpuGcm::CreateIoBuffers()
  76. {
  77. const uint nCmdBufferOverfetchSlack = 1024;
  78. uint nFpRingIoBufferSize = 16 * 1024;
  79. uint nFpRingBufferSize = Max( nFpRingIoBufferSize, nCmdBufferOverfetchSlack ); // this buffer is RSX-write-only and sits at the end of mapped memory, so it also serves as overfetch slack and must therefore be at least the size of that slack
  80. g_spuGcmShared.m_fpcpRing.SetIoBuffer( g_ps3gcmGlobalState.IoMemoryPrealloc( nFpRingIoBufferSize, nFpRingBufferSize ), nFpRingIoBufferSize );
  81. m_pMlaaBufferCookie = NULL;
  82. m_pMlaaBuffer = NULL;
  83. m_pMlaaBufferOut = NULL;
  84. m_pEdgePostRsxLock = NULL;
  85. if( !CommandLine()->FindParm( "-noMlaa" ) )
  86. //if( CommandLine()->FindParm( "-edgeMlaa" ) )
  87. {
  88. uint nSizeofEdgePostBuffer = g_ps3gcmGlobalState.GetRenderSurfaceBytes( 128 );
  89. m_pMlaaBuffer = g_ps3gcmGlobalState.IoMemoryPrealloc( 128, nSizeofEdgePostBuffer + sizeof( g_vuSpuGcmCookie ) + sizeof( uint32 ) * CPs3gcmDisplay::SURFACE_COUNT );
  90. if( m_pMlaaBuffer )
  91. {
  92. m_pMlaaBufferOut = m_pMlaaBuffer;//( void* )( uintp( m_pMlaaBuffer ) + nSizeofEdgePostBuffer );
  93. m_pMlaaBufferCookie = ( vec_uint4* ) ( uintp( m_pMlaaBufferOut ) + nSizeofEdgePostBuffer );
  94. *m_pMlaaBufferCookie = g_vuSpuGcmCookie;
  95. m_pEdgePostRsxLock = ( uint32* )( m_pMlaaBufferCookie + 1 );
  96. }
  97. else
  98. {
  99. // if MlaaBuffer is NULL, it just means we're in the pass of computing the IO memory requirements
  100. }
  101. }
  102. }
  103. //
  104. // memory optimization: IO memory has slack, use it if it's big enough
  105. //
  106. void CSpuGcm::UseIoBufferSlack( uint nIoBufferSlack )
  107. {
  108. uint nSpuDrawQueueSize = CalculateMemorySizeFromCmdLineParam( "-spuDrawRingSize", 512 * 1024, 32 * 1024 );
  109. Msg( "SPU draw queue size: %d Kb\n" , nSpuDrawQueueSize / 1024 );
  110. uint nSpuDrawQueueDeferredSize = CalculateMemorySizeFromCmdLineParam( "-spuDrawDeferredRingSize", 210 * 1024, 32 * 1024 );
  111. Msg( "SPU draw deferred queue size: %d Kb\n" , nSpuDrawQueueDeferredSize / 1024 );
  112. m_nSpuDrawQueueSelector = 0;
  113. m_spuDrawQueues[0].Init( nSpuDrawQueueSize, &g_spuGcmShared.m_nSpuDrawGet[0], OnSpuDrawQueueFlush, OnSpuDrawQueueStall );
  114. m_spuDrawQueues[1].Init( nSpuDrawQueueDeferredSize, &g_spuGcmShared.m_nSpuDrawGet[1], OnSpuDrawQueueFlushDeferred, OnSpuDrawQueueStallDeferredDelegator );
  115. for( uint i = 0; i < ARRAYSIZE( m_pDeferredStates ); ++i )
  116. m_pDeferredStates[i] = ( DeferredState_t * ) g_ps3gcmGlobalState.IoSlackAlloc( 128, sizeof( DeferredState_t ) );
  117. for( uint i = 0; i < ARRAYSIZE( m_pDeferredQueueCursors ); ++i )
  118. m_pDeferredQueueCursors[i] = m_spuDrawQueues[1].GetCursor();
  119. m_pDeferredQueueSegment = m_pDeferredQueueCursors[0];
  120. m_pDeferredChunkSubmittedTill[1] = m_spuDrawQueues[1].GetCursor();
  121. for( uint i = 0; i < ARRAYSIZE( m_spuDrawQueues ); ++i )
  122. g_spuGcmShared.m_nSpuDrawGet[i] = m_spuDrawQueues[i].GetSignal();
  123. }
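// Note: m_spuDrawQueues[0] is the immediate ("normal") SPU draw queue and m_spuDrawQueues[1] is the
// deferred queue (used while the deferred/MLAA path is active); m_nSpuDrawQueueSelector chooses which
// one GetDrawQueue() returns, and the flush/stall callbacks passed to Init() above differ accordingly.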
  124. static fltx4 g_vertexProgramConstants[CELL_GCM_VTXPRG_MAX_CONST];
  125. // static uint s_nLastCtxBufferCookie = 0;
  126. // static uint s_nCtxBufferSegmentSubmitTime = 0; // divide by 2 and it'll be the weighted average of 79.8MHz ticks between segment submissions
  127. void CSpuGcm::OnGcmInit()
  128. {
  129. if( 127 & uintp( g_ps3gcmGlobalState.m_pLocalBaseAddress ) )
  130. {
  131. Error( "Local addresses map to main memory without proper 128-byte alignment! Some DMA assumptions are broken!!\n" );
  132. }
  133. if( 127 & uintp( g_ps3gcmGlobalState.m_nIoOffsetDelta ) )
  134. {
  135. Error( "IO addresses map to local memory without proper 128-byte alignment! Some DMA assumptions are broken!!\n" );
  136. }
  137. V_memset( &g_spuGcmShared.m_dxGcmState, 0, sizeof( g_spuGcmShared.m_dxGcmState ) );
  138. V_memset( &g_spuGcmShared.m_cachedRenderState, 0, sizeof( g_spuGcmShared.m_cachedRenderState ) );
  139. m_nPcbringWaitSpins = 0;
  140. m_pPcbringBuffer = NULL;
  141. m_eaLastJobThatUpdatesSharedState = 0;
  142. g_spuGcmShared.m_enableStallWarnings = ( CommandLine()->FindParm( "-enableStallWarnings" ) != 0 );
  143. g_spuGcmShared.m_edgeGeomFeeder.Init( m_edgeGeomRingBuffer.Size() );
  144. g_edgeGeomRing.Init( m_edgeGeomRingBuffer.DataInLocalMemory(), m_edgeGeomRingBuffer.Size(), g_ps3gcmGlobalState.m_nIoOffsetDelta, g_ps3gcmGlobalState.m_pLocalBaseAddress, GCM_LABEL_EDGEGEOMRING );
  145. g_spuGcmShared.m_eaEdgeGeomRing = &g_edgeGeomRing;
  146. g_spuGcmShared.m_fpcpRing.OnGcmInit( g_ps3gcmGlobalState.m_nIoOffsetDelta );
  147. g_spuGcmShared.m_nDrawLayerBits = g_spuGcmShared.LAYER_RENDER;
  148. g_spuGcmShared.m_nDrawLayerPredicates = g_spuGcmShared.LAYER_RENDER_AND_Z;
  149. g_spuGcmShared.m_nLastRsxInterruptValue = 0;
  150. if( m_pEdgePostRsxLock )
  151. {
  152. for( uint i = 0; i < CPs3gcmDisplay::SURFACE_COUNT; ++i )
  153. {
  154. m_pEdgePostRsxLock[i] = CELL_GCM_RETURN(); // assume previous flips already happened
  155. }
  156. }
  157. g_pVJobs->Register( this );
  158. m_zPass.Init();
  159. m_bUseDeferredDrawQueue = true;
  160. BeginGcmStateTransaction();
  161. g_pixelShaderPatcher.InitLocal( g_spuGcmShared.m_fpcpRing.GetRsxBuffer(), g_spuGcmShared.m_fpcpRing.GetRsxBufferSize() );
  162. g_spuGcmShared.m_eaFpcpSharedState = g_pixelShaderPatcher.m_state.m_pSharedState;
  163. g_spuGcmShared.m_nFpcpBufferMask = g_spuGcmShared.m_eaFpcpSharedState->m_nBufferMask;
  164. g_spuGcmShared.m_eaLocalBaseAddress = (uint32)g_ps3gcmGlobalState.m_pLocalBaseAddress;
  165. g_spuGcmShared.m_cachedRenderState.m_nDisabledSamplers = 0;
  166. g_spuGcmShared.m_cachedRenderState.m_nSetTransformBranchBits = 0;
  167. g_spuGcmShared.m_nDebuggerRunMask = SPUGCM_DEBUG_MODE ? 2 : 0;
  168. g_spuGcmShared.m_eaLastJobThatUpdatedMe = 0;
  169. g_spuGcmShared.m_nFpcPatchCounterOfLastSyncJob = g_pixelShaderPatcher.m_nFpcPatchCounterOfLastSyncJob;
  170. g_spuGcmShared.m_nFpcPatchCounter = g_pixelShaderPatcher.m_nFpcPatchCounter;
  171. g_spuGcmShared.m_nFpcpStartRangesAfterLastSync = g_spuGcmShared.m_eaFpcpSharedState->m_nStartRanges;
  172. g_spuGcmShared.m_eaZPassSavedState = NULL;
  173. g_spuGcmShared.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine = g_ps3gcmGlobalState.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine;
  174. g_spuGcmShared.m_eaPs3texFormats = g_ps3texFormats;
  175. g_spuGcmShared.m_eaVertexProgramConstants = g_vertexProgramConstants;
  176. m_nGcmFlushJobScratchSize = 0;
  177. m_nFrame = 0;
  178. // we shouldn't have used this format yet
  179. Assert( g_ps3texFormats[PS3_TEX_CANONICAL_FORMAT_COUNT].m_gcmPitchPer4X == 0 );
  180. Assert( g_ps3texFormats[PS3_TEX_CANONICAL_FORMAT_COUNT-1].m_gcmPitchPer4X != 0 );
  181. Assert( !( 0xF & uintp( g_spuGcmShared.m_eaPs3texFormats ) ) );
  182. Assert( g_spuGcmShared.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine );
  183. COMPILE_TIME_ASSERT( !GCM_CTX_UNSAFE_MODE );
  184. {
  185. m_pFinishLabel = cellGcmGetLabelAddress( GCM_LABEL_SPUGCM_FINISH );
  186. *m_pFinishLabel = s_nFinishLabelValue;
  187. uint nSysringBytes = g_ps3gcmGlobalState.m_nCmdSize - SYSTEM_CMD_BUFFER_RESERVED_AREA - 16 - sizeof( SysringWrapSequence::Tail_t ); // 16 bytes for the JTN to wrap the buffer around, and to be able to DMA it in 16-byte chunks
  188. nSysringBytes &= -16; // make it 16-byte aligned
  189. uint eaSysringBuffer = uintp( g_ps3gcmGlobalState.m_pIoAddress ) + SYSTEM_CMD_BUFFER_RESERVED_AREA;
  190. uint32 * pSysringBufferEnd = ( uint32* )( eaSysringBuffer + nSysringBytes );
  191. *pSysringBufferEnd = // this is not strictly needed...
  192. g_spuGcmShared.m_sysringWrap.m_tail.m_nJumpToBegin = CELL_GCM_JUMP( SYSTEM_CMD_BUFFER_RESERVED_AREA );
  193. V_memset( g_spuGcmShared.m_sysringWrap.m_tail.m_nNops, 0, sizeof( g_spuGcmShared.m_sysringWrap.m_tail.m_nNops ) );
  194. Assert( !( 0xF & uint( &g_spuGcmShared.m_sysringWrap ) ) );
  195. //COMPILE_TIME_ASSERT( SPUGCM_USE_SET_REFERENCE_FOR_SYSRING_SIGNAL );
  196. //g_spuGcmShared.m_pEaSysringEndLabel = ( uint32* ) cellGcmGetLabelAddress( GCM_LABEL_SYSRING_END );
  197. //*g_spuGcmShared.m_pEaSysringEndLabel = g_spuGcmShared.m_sysring.m_nEnd; // pretend we finished all processing
  198. //g_spuGcmShared.m_nSysringSegmentWords = ( g_ps3gcmGlobalState.m_nCmdSize - nSysringCmdBufferSystemArea ) / sizeof( uint32 ) / g_spuGcmShared.NUM_SYSTEM_SEGMENTS;
  199. //g_spuGcmShared.m_nSysringSegmentWords &= -16; // make it aligned, at least -4 words but may be more for easier debugging (more round numbers)
  200. g_spuGcmShared.m_nIoOffsetDelta = g_ps3gcmGlobalState.m_nIoOffsetDelta;
  201. g_spuGcmShared.m_nSysringWaitSpins = 0;
  202. g_spuGcmShared.m_nSysringPuts = 0;
  203. g_spuGcmShared.m_nSysringSegmentSizeLog2 = 29 - __cntlzw( g_ps3gcmGlobalState.m_nCmdSize ); // make 4 subsegments; guarantee segment switch whenever the ring wraps around
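// Illustration (example value assumed, not from the original): for a power-of-two command buffer,
// __cntlzw( 1 << k ) == 31 - k, so the line above yields k - 2, i.e. segments of 1/4 the buffer size;
// e.g. a 2 MB ( 1 << 21 ) command buffer gives m_nSysringSegmentSizeLog2 == 19 (512 KB segments).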
  204. // we need AT LEAST 2 segments and each segment must be AT LEAST 1kb - for performant and reliable operation;
  205. Assert( ( g_ps3gcmGlobalState.m_nCmdSize >> g_spuGcmShared.m_nSysringSegmentSizeLog2 ) > 2 && ( g_ps3gcmGlobalState.m_nCmdSize >> g_spuGcmShared.m_nSysringSegmentSizeLog2 ) < 8 && g_spuGcmShared.m_nSysringSegmentSizeLog2 >= 10 );
  206. //g_spuGcmShared.m_nSysringPut = 0;
  207. //g_spuGcmShared.m_nSysringEnd = g_spuGcmShared.NUM_SYSTEM_SEGMENTS; // pretend we got the whole buffer already
  208. g_spuGcmShared.m_nDebuggerBreakMask = 0x00000000;
  209. g_spuGcmShared.m_nDebugLastSeenGet = 0xFEFEFEFE;
  210. uint nPcbringSize = SPUGCM_DEFAULT_PCBRING_SIZE;
  211. COMPILE_TIME_ASSERT( !( SPUGCM_DEFAULT_PCBRING_SIZE & ( SPUGCM_DEFAULT_PCBRING_SIZE - 1 ) ) );
  212. g_spuGcmShared.m_nPcbringSize = nPcbringSize ;
  213. // 12 extra bytes are allocated for buffer alignment code to avoid writing past end of the buffer ; 4 more bytes are for the cookie
  214. //m_pPcbringBuffer = ( uint32 * )MemAlloc_AllocAligned( nPcbringSize + 12 + 4, 0x10 );
  215. //*AddBytes( m_pPcbringBuffer, g_spuGcmShared.m_nPcbringSize + 12 ) = 0x1234ABCD;
  216. m_nPcbringBegin = 0;
  217. g_spuGcmShared.m_nPcbringEnd = g_spuGcmShared.m_nPcbringSize; // consider the full ring buffer already processed on SPU and free: this End is the end of "free to use" area
  218. // this is the max count of words needed to align the cmd buffer and insert any write-labels/set-reference-values
  219. // we need to add at least 3 to the count, in case we align current pointer in the process ( because we may need to submit )
  220. // also, we want this segment size to fit inside the between-segment signal
  221. m_nMaxPcbringSegmentBytes = Min<uint>( ( ( nPcbringSize - 32 - SPUGCM_SIZEOF_SYSRING_ENDOFSEGMENT_SIGNAL_COMMAND ) / 4 ) & -16, ( 1 << g_spuGcmShared.m_nSysringSegmentSizeLog2 ) - SPUGCM_SIZEOF_SYSRING_ENDOFSEGMENT_SIGNAL_COMMAND - 12 ); //
  222. // we definitely need PCBring segment to fit well into local store
  223. m_nMaxPcbringSegmentBytes = Min<uint>( m_nMaxPcbringSegmentBytes, SPUGCM_LSRING_SIZE / 2 );
  224. m_nMaxPcbringSegmentBytes = Min<uint>( m_nMaxPcbringSegmentBytes, SPUGCM_MAX_PCBRING_SEGMENT_SIZE );
  225. m_nMaxPcbringSegmentBytes &= -16; // make it 16-byte aligned..
  226. cellGcmReserveMethodSize( gCellGcmCurrentContext, 3 ); // we need at most ( 2 words for reference command + ) 3 words for alignment
  227. // align the buffer on 16-byte boundary, because we manage it in 16-byte increments
  228. while( 0xF & uintp( gCellGcmCurrentContext->current ) )
  229. {
  230. *( gCellGcmCurrentContext->current++ ) = CELL_GCM_METHOD_NOP;
  231. }
  232. g_spuGcmShared.m_sysring.Init( eaSysringBuffer, nSysringBytes, uint( gCellGcmCurrentContext->current ) - eaSysringBuffer );
  233. g_spuGcmShared.m_sysringRo.Init( GCM_LABEL_SYSRING_SIGNAL );
  234. g_spuGcmShared.m_nSysringWrapCounter = 0;
  235. g_spuGcmShared.m_eaGcmControlRegister = cellGcmGetControlRegister();
  236. g_spuGcmShared.m_eaSysringLabel = cellGcmGetLabelAddress( GCM_LABEL_SYSRING_SIGNAL );
  237. g_spuGcmShared.m_eaDebugLabel[0] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG0 );
  238. g_spuGcmShared.m_eaDebugLabel[1] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG1 );
  239. g_spuGcmShared.m_eaDebugLabel[2] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG2 );
  240. *g_spuGcmShared.m_eaSysringLabel = g_spuGcmShared.m_sysring.GetSignal(); // pretend we executed WriteLabel
  241. g_spuGcmShared.m_nLastSignal = g_spuGcmShared.m_sysring.GetInvalidSignal();
  242. #if SPU_GCM_DEBUG_TRACE
  243. g_spuGcmShared.m_nDebugTraceBufferNext = 0;
  244. g_spuGcmShared.m_eaDebugTraceBuffer = ( SpuGcmDebugTrace_t* )MemAlloc_AllocAligned( g_spuGcmShared.DEBUG_BUFFER_COUNT * sizeof( SpuGcmDebugTrace_t ), 16 );
  245. #endif
  246. if( SPUGCM_USE_SET_REFERENCE_FOR_SYSRING_SIGNAL )
  247. {
  248. g_spuGcmShared.m_eaGcmControlRegister->ref = g_spuGcmShared.m_sysring.m_nEnd;// pretend we finished all processing
  249. }
  250. #ifdef _DEBUG
  251. m_nJobsPushed = 0;
  252. // fill in JTS in the rest of the buffer
  253. for( uint32 * pSlack = gCellGcmCurrentContext->current; pSlack < pSysringBufferEnd; ++pSlack )
  254. *pSlack = CELL_GCM_JUMP( uintp( pSlack ) - uintp( g_ps3gcmGlobalState.m_pIoAddress ) );
  255. #endif
  256. // set reference BEFORE we switch to sysring
  257. uint nGcmPut = uintp( gCellGcmCurrentContext->current ) + g_spuGcmShared.m_nIoOffsetDelta;
  258. Assert( !( 0xF & nGcmPut ) );
  259. __sync();
  260. g_spuGcmShared.m_eaGcmControlRegister->put = nGcmPut;
  261. // wait for RSX to reach this point, then switch to the new command buffer scheme
  262. int nAttempts = 0;
  263. while( g_spuGcmShared.m_eaGcmControlRegister->get != nGcmPut )
  264. {
  265. sys_timer_usleep(1000);
  266. if( ++nAttempts > 1000 )
  267. {
  268. Warning( "Cannot properly wait for RSX in OnGcmInit(%X!=%X); assuming everything's all right anyway.\n", g_spuGcmShared.m_eaGcmControlRegister->get, nGcmPut );
  269. break; // don't wait forever..
  270. }
  271. }
  272. //////////////////////////////////////////////////////////////////////////
  273. // Switch to PPU Command Buffer RING
  274. //
  275. // set reference BEFORE we switch to sysring; wait for all RSX initialization to go through before switching
  276. PCB_RING_CTX.begin = PCB_RING_CTX.current = NULL;//m_pPcbringBuffer;
  277. // we need to at least double-buffer to avoid deadlocks while waiting to submit a Pcbring segment
  278. // Each segment ends with a reference value update, and we need that update to unblock a piece of memory for use by subsequent submits
  279. Assert( GetMaxPcbringSegmentBytes() <= nPcbringSize / 2 );
  280. PCB_RING_CTX.end = NULL;//AddBytes( m_pPcbringBuffer, GetMaxPcbringSegmentBytes() );
  281. PCB_RING_CTX.callback = SpuGcmCommandBufferReserveCallback;
  282. #ifdef CELL_GCM_DEBUG // [
  283. gCellGcmDebugCallback = SpuGcmDebugFinish;
  284. cellGcmDebugCheckEnable( CELL_GCM_TRUE );
  285. #endif // ]
  286. }
  287. }
  288. inline signed int CSpuGcm::GetPcbringAvailableBytes()const
  289. {
  290. int nReallyAvailable = int32( *(volatile uint32*)&g_spuGcmShared.m_nPcbringEnd ) - int32( m_nPcbringBegin );
  291. #ifdef DBGFLAG_ASSERT
  292. Assert( uint( nReallyAvailable ) <= g_spuGcmShared.m_nPcbringSize );
  293. static int s_nLastPcbringAvailableBytes = -1;
  294. s_nLastPcbringAvailableBytes = nReallyAvailable;
  295. #endif
  296. Assert( nReallyAvailable >= 0 );
  297. return nReallyAvailable;
  298. }
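// Note: m_nPcbringEnd appears to be advanced by the SPU side as submitted segments are consumed,
// while m_nPcbringBegin tracks what the PPU has written, so End - Begin is the number of bytes
// currently free to write into the ring; the asserts above bound it to [0, ring size].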
  299. int CSpuGcm::OnGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nReserveCount )
  300. {
  301. FillNops(context);
  302. // IMPORTANT: we only allocate the necessary number of words here, no more no less
  303. // if we over-allocate, we may end up reordering commands in SPU draw queue following after GCM_FUNC commands
  304. uint nReserve = nReserveCount;
  305. uint32 * pDrawQueueCommand = GetDrawQueue()->AllocWords( nReserve + 1 );
  306. *pDrawQueueCommand = SPUDRAWQUEUE_GCMCOMMANDS_METHOD | nReserve;
  307. context->begin = context->current = pDrawQueueCommand + 1;
  308. context->end = context->begin + nReserve;
  309. if( IsDebug() )
  310. V_memset( context->current, 0xFE, nReserve * 4 );
  311. return CELL_OK;
  312. }
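// In effect, libgcm reserve requests are not sent to RSX directly here: each reservation is wrapped
// in a SPUDRAWQUEUE_GCMCOMMANDS_METHOD chunk of the SPU draw queue, which the SPU side presumably
// replays into the real command buffer later. The 0xFE fill in debug builds marks unwritten words.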
  313. void CSpuGcm::BeginGcmStateTransaction()
  314. {
  315. m_nCurrentBatch = BATCH_GCMSTATE;
  316. SetCurrentBatchCursor( GetDrawQueue()->GetCursor() );
  317. }
  318. void CSpuGcm::PushStateFlushJob( SpuDrawQueue * pDrawQueue, uint nResultantSpuDrawQueueSignal, uint32 *pCursorBegin, uint32 * pCursorEnd )
  319. {
  320. // only submit the job if there are any commands in the state command buffer
  321. CellSpursJob128 * pJob = m_jobPool128.Alloc( *m_pRoot->m_pGcmStateFlush );
  322. job_gcmstateflush::JobParams_t * pJobParams = job_gcmstateflush::GetJobParams( pJob );
  323. pJob->header.useInOutBuffer = 1;
  324. CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );
  325. dmaConstructor.AddInputDma( sizeof( g_spuGcmShared ), &g_spuGcmShared ); // dma[0]; must be the first to be 128-byte aligned for atomics
  326. uint nSizeofDrawQueueUploadBytes = pDrawQueue->Collect( pCursorBegin, pCursorEnd, dmaConstructor );
  327. Assert( !( nSizeofDrawQueueUploadBytes & 3 ) );
  328. dmaConstructor.AddSizeInOrInOut( 48 + SPUGCM_LSRING_SIZE ); // 16 bytes for alignment; 16 for lsZero; 16 for lsTemp;
  329. COMPILE_TIME_ASSERT( sizeof( CPs3gcmTextureLayout::Format_t ) == 16 );
  330. dmaConstructor.AddCacheDma( g_nPs3texFormatCount * sizeof( CPs3gcmTextureLayout::Format_t ), g_ps3texFormats ) ;
  331. dmaConstructor.FinishIoBuffer( &pJob->header, pJobParams );
  332. pJobParams->m_nSkipDrawQueueWords = ( uintp( pCursorBegin ) / sizeof( uint32 ) ) & 3;
  333. pJobParams->m_nSizeofDrawQueueUploadWords = nSizeofDrawQueueUploadBytes / sizeof( uint32 ) ;
  334. Assert( uint( pJobParams->m_nSizeofDrawQueueUploadWords ) == nSizeofDrawQueueUploadBytes / sizeof( uint32 ) ); // make sure it fits into uint16
  335. pJobParams->m_nSpuDrawQueueSignal = nResultantSpuDrawQueueSignal;
  336. #ifdef DBGFLAG_ASSERT
  337. SpuDrawQueue * pSignalDrawQueue = &m_spuDrawQueues[ nResultantSpuDrawQueueSignal & 3 ? 1 : 0 ]; (void)pSignalDrawQueue;
  338. Assert( pSignalDrawQueue->IsValidCursor( (uint32*)( nResultantSpuDrawQueueSignal & ~3 ) ) );
  339. #endif
  340. uint nResultantSpuDrawQueueIndex = nResultantSpuDrawQueueSignal & 3;
  341. m_pDeferredChunkSubmittedTill[ nResultantSpuDrawQueueIndex ] = ( uint32* )( nResultantSpuDrawQueueSignal & ~3 );
  342. Assert( CELL_OK == cellSpursCheckJob( (const CellSpursJob256 *)pJob, sizeof( *pJob ), 256 ) );
  343. m_eaLastJobThatUpdatesSharedState = ( uintp )pJob;
  344. pJob->header.sizeScratch = m_nGcmFlushJobScratchSize;
  345. m_nGcmFlushJobScratchSize = 0;
  346. PushSpuGcmJob( pJob );
  347. if( SPUGCM_DEBUG_MODE )
  348. {
  349. // in SPUGCM_DEBUG_MODE, we execute all jobs and wait for them to complete. So, the GET pointer should always trail our pNext pointer
  350. Assert( g_spuGcmShared.m_nSpuDrawGet[nResultantSpuDrawQueueIndex] == ( nResultantSpuDrawQueueSignal & ~3 ) );
  351. }
  352. }
  353. void CSpuGcm::GcmStateFlush( )
  354. {
  355. Assert( m_nCurrentBatch == BATCH_GCMSTATE );
  356. if( IsDeferredDrawQueue() )
  357. {
  358. Warning( "Unexpected Flush in deferred spu draw queue\n" );
  359. OpenDeferredChunk();
  360. }
  361. else
  362. {
  363. if( GetCurrentBatchCursor() != GetDrawQueue()->GetCursor() )
  364. {
  365. FillNops( &PCB_RING_CTX );
  366. Assert( GetDrawQueue() == &m_spuDrawQueues[0] );
  367. PushStateFlushJob( &m_spuDrawQueues[0], m_spuDrawQueues[0].GetSignal(), GetCurrentBatchCursor(), GetDrawQueue()->GetCursor() );
  368. BeginGcmStateTransaction();
  369. ZPassCheckpoint( 6 );
  370. }
  371. }
  372. }
  373. void CSpuGcm::PushSpuGcmJob( CellSpursJob128 * pJob )
  374. {
  375. #ifdef _DEBUG
  376. m_nJobsPushed++;
  377. #endif
  378. PushSpuGcmJobCommand( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
  379. if( SPUGCM_DEBUG_MODE )
  380. {
  381. if( !m_zPass )
  382. {
  383. // in ZPass_Z the job doesn't free its descriptor
  384. // in ZPass_Render, we don't start the jobs through here
  385. // so we can't use this spin-wait to wait for the job to complete
  386. while( *( volatile uint64* )&pJob->header.eaBinary )
  387. {
  388. sys_timer_usleep( 60 );
  389. }
  390. }
  391. while( g_spuGcmShared.m_eaLastJobThatUpdatedMe != uintp( pJob ) )
  392. {
  393. sys_timer_usleep( 60 );
  394. }
  395. }
  396. }
  397. void CSpuGcm::PushSpuGcmJobCommand( uint64 nCommand )
  398. {
  399. if( m_zPass )
  400. {
  401. m_zPass.PushCommand( nCommand );
  402. }
  403. else
  404. {
  405. m_jobSink.PushSyncJobSync( nCommand );
  406. }
  407. }
  408. void CSpuGcm::ZPassCheckpoint( uint nReserveSlots )
  409. {
  410. if( m_zPass )
  411. {
  412. uint nFreeSubchainSlots = m_zPass.GetSubchainCapacity();
  413. if( nFreeSubchainSlots < 2 * nReserveSlots )
  414. {
  415. ExecuteOnce( Warning("Aborting Z prepass: not enough room for commands in zpass sub-job-chain (%d left).\n", nFreeSubchainSlots ) );
  416. AbortZPass(); // initiate Abort sequence of ZPass; reentrant
  417. }
  418. uint nFreeJobDescriptors = m_jobPool128.GetReserve( m_zPass.m_nJobPoolMarker );
  419. if( nFreeJobDescriptors < nReserveSlots )
  420. {
  421. ExecuteOnce( Warning("Aborting Z prepass: not enough room for job descriptors in m_jobPool128 (%d left)\n", nFreeJobDescriptors ) );
  422. AbortZPass();
  423. }
  424. }
  425. }
  426. void CSpuGcm::OnSetPixelShaderConstant()
  427. {
  428. Assert( !IsDeferredDrawQueue() );
  429. if( m_zPass )
  430. {
  431. if( !m_zPass.m_isInEndZPass )
  432. {
  433. if( g_pixelShaderPatcher.GetJournalSpaceLeftSince( m_zPass.m_nFpcpStateEndOfJournalIdxAtZPassBegin ) < 512 )
  434. {
  435. ExecuteOnce( Warning( "Performance Warning: Too many pixel shader constants set inside ZPass; aborting ZPass\n" ) );
  436. AbortZPass();
  437. }
  438. }
  439. }
  440. else
  441. {
  442. // we have space for 48kB (3k of constants) in FPCP;
  443. // every SetPixelShaderConstant may add 97 constants (96 values, 1 header)
  444. if( g_pixelShaderPatcher.GetJournalSpaceUsedSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) > ( 32*1024 / 16 ) || g_pixelShaderPatcher.GetJournalSpaceLeftSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) < 512 )
  445. {
  446. ExecuteOnce( Warning("Performance Warning: SetPixelShaderConstantF called for %d constants, but no draw calls were issued. Flushing FPCP state.\n", g_pixelShaderPatcher.GetJournalSpaceUsedSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) ) );
  447. // flush GCM with only one purpose: make it flush the patcher
  448. GetDrawQueue()->Push2( SPUDRAWQUEUE_FLUSH_FPCP_JOURNAL, g_pixelShaderPatcher.GetStateEndOfJournalIdx() );
  449. GcmStateFlush();
  450. }
  451. }
  452. }
  453. void CSpuGcm::OnSpuDrawQueueStallDeferredDelegator( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords )
  454. {
  455. g_spuGcm.OnSpuDrawQueueStallDeferred( pDrawQueue, pGet, nWords );
  456. }
  457. void CSpuGcm::OnSpuDrawQueueStallDeferred( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords )
  458. {
  459. // we need to try to wait for the previous deferred batch to finish
  460. // in any case we should be prepared for "out of space" condition
  461. // in which case we'll just execute all deferred commands right now
  462. if( pGet == m_pDeferredChunkSubmittedTill[1] )
  463. {
  464. // we have nothing else to wait for, we need to free the space by executing deferred commands now
  465. // full flush (this frame only, since the previous frame was flushed the first time we called DrawQueueDeferred())
  466. FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
  467. // the only deferred chunk that can resize is GCMFLUSH
  468. // and handling it is pretty easy: we can either execute whatever it collected so far
  469. if( m_pDeferredChunkHead )
  470. {
  471. // sanity check: we shouldn't have chunks as big as 64KB
  472. Assert( m_spuDrawQueues[1].Length( m_pCurrentBatchCursor[1], m_pDeferredChunkHead ) <= 64*1024 );
  473. Assert( *m_pDeferredChunkHead == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD && m_pDeferredChunkHead == m_pDeferredQueueCursors[0] );
  474. }
  475. // temporarily switch to normal queue state in order to replay the deferred queue commands and purge them
  476. uint32 * pDeferredQueueSegment = m_pDeferredQueueSegment;
  477. m_nSpuDrawQueueSelector = 0;
  478. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
  479. BeginGcmStateTransaction(); // this transaction is beginning in Normal draw queue; Deferred queue is currently in "frozen" state (almost out of memory)
  480. g_flipHandler.QmsAdviceBeforeDrawPrevFramebuffer();
  481. // flush previous frame first, and if it doesn't change Get , flush this frame
  482. ExecuteDeferredDrawQueue( 1 );
  483. extern void DxDeviceForceUpdateRenderTarget( );
  484. DxDeviceForceUpdateRenderTarget( ); // recover main render target, as it was screwed up by execution of previous frame's commands
  485. ExecuteDeferredDrawQueue( 0 );
  486. m_nFramesToDisableDeferredQueue = 1;
  487. // return to the deferred state after purging the queue. During purging the deferred queue, DrawQueue(Normal|Deferred) could not have been called
  488. // this "unfreezes" the deferred queue, which should by now be almost-all-free( or pending, depending on how fast SPUs will chew through it)
  489. Assert( m_pDeferredQueueSegment == pDeferredQueueSegment );
  490. // we executed up to this point (last opened chunk), we discard everything before it.
  491. // the last opened chunk is perfectly fine to begin the queue segment, so we pretend we began deferred queue there
  492. m_pDeferredQueueSegment = m_pDeferredQueueCursors[0];
  493. m_nSpuDrawQueueSelector = 1;
  494. }
  495. }
  496. void CSpuGcm::OnSpuDrawQueueFlushDeferred( SpuDrawQueue *pDrawQueue )
  497. {
  498. // break up long GCM chunks
  499. Assert( pDrawQueue == g_spuGcm.GetDrawQueue() );
  500. Assert( !g_spuGcm.m_pDeferredChunkHead || ( *g_spuGcm.m_pDeferredChunkHead & ~SPUDRAWQUEUE_DEFERRED_GCMFLUSH_MASK ) == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD ); // this is the only chunk we allocate incrementally
  501. // prevent this from being called recursively: reset flush watermark before doing anything else
  502. pDrawQueue->SetFlushWatermarkFrom( pDrawQueue->GetCursor() );
  503. g_spuGcm.OpenDeferredChunk();
  504. }
  505. void CSpuGcm::OnSpuDrawQueueStall( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint32 nWords )
  506. {
  507. Assert( pDrawQueue == &g_spuGcm.m_spuDrawQueues[0] );
  508. StallAndWarning( "SpuDrawQueue stall: PPU is waiting for SPU, and SPU is probably waiting for RSX\n"/*, nWords, pGet, g_spuGcm.m_spuDrawQueues[0].GetCursor()*/ );
  509. }
  510. void CSpuGcm::OnSpuDrawQueueFlush( SpuDrawQueue *pDrawQueue )
  511. {
  512. // currently, there's only one such queue and it's the current draw queue
  513. Assert( pDrawQueue == g_spuGcm.GetDrawQueue() );
  514. g_spuGcm.GcmStateFlush();
  515. }
  516. void CSpuGcm::OnSpuDrawQueueFlushInZPass()
  517. {
  518. //
  519. // flush watermark has changed now (it changes on every collect())
  520. // override flush watermark to flush before we reach ZPass cursor,
  521. // and if it's impossible, then Abort ZPass - we don't have enough space
  522. // in SPU GCM buffer
  523. //
  524. // Take care not to flush excessively when pushing the last few commands into
  525. // SPUGCM draw buffer because we can be doing that right around flush watermark
  526. // frequently
  527. //
  528. uint32 * pOldFlushWatermark = GetDrawQueue()->GetFlushWatermark();
  529. GcmStateFlush();
  530. uint32 * pNewFlushWatermark = GetDrawQueue()->GetFlushWatermark();
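// The test below checks whether m_zPass.m_pCursor falls inside the (possibly wrapped-around)
// interval between the old and new flush watermarks; if it does, the next watermark flush would
// land past the ZPass replay cursor, so the ZPass has to be aborted here.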
  531. if( pNewFlushWatermark < pOldFlushWatermark ? pNewFlushWatermark >= m_zPass.m_pCursor || pOldFlushWatermark <= m_zPass.m_pCursor : pOldFlushWatermark <= m_zPass.m_pCursor && m_zPass.m_pCursor <= pNewFlushWatermark )
  532. {
  533. // the next flush will be too late;
  534. // NOTE: we can recover up to 32KB by adjusting the flush watermark here, but I have bigger fish to fry, so we'll just abort ZPass right now and here
  535. AbortZPass();
  536. }
  537. }
  538. void CSpuGcm::OnSpuDrawQueueFlushInZPass( SpuDrawQueue *pDrawQueue )
  539. {
  540. // TODO: check if cursor is intersected and potentially EndZPass()
  541. Assert( pDrawQueue == g_spuGcm.GetDrawQueue() );
  542. g_spuGcm.OnSpuDrawQueueFlushInZPass();
  543. }
  544. void SpuGcmCommandBufferFlush()
  545. {
  546. g_spuGcm.CmdBufferFlush();
  547. }
  548. SpuDrawHeader_t * CSpuGcm::BeginDrawBatch()
  549. {
  550. SpuDrawHeader_t * pDrawHeader;
  551. if( IsDeferredDrawQueue() )
  552. {
  553. uintp eaSpuDrawHeader = ( uintp ) OpenDeferredChunk( SPUDRAWQUEUE_DEFERRED_DRAW_METHOD, 3 + ( sizeof( SpuDrawHeader_t ) + sizeof( IDirect3DVertexDeclaration9 * /*pVertDecl*/ ) ) / sizeof( uint32 ) );
  554. pDrawHeader = ( SpuDrawHeader_t * ) AlignValue( eaSpuDrawHeader, 16 );
  555. }
  556. else
  557. {
  558. GcmStateFlush();
  559. // we must be in the default batch transaction, and it must be empty so that we can switch the transaction type
  560. Assert( m_nCurrentBatch == BATCH_GCMSTATE && GetCurrentBatchCursor() == GetDrawQueue()->GetCursor() );
  561. pDrawHeader = GetDrawQueue()->AllocAligned<SpuDrawHeader_t>();
  562. }
  563. m_nCurrentBatch = BATCH_DRAW;
  564. Assert( GetDrawQueue()->IsValidCursor( (uint32*)( pDrawHeader + 1 ) ) );
  565. SetCurrentBatchCursor( ( uint32* ) pDrawHeader );
  566. return pDrawHeader;
  567. }
  568. CellSpursJob128 * CSpuGcm::PushDrawBatchJob( uint nResultantSpuDrawQueueSignal, SpuDrawHeader_t * pDrawHeader, IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup )
  569. {
  570. CellSpursJob128 * pJob = m_jobPool128.Alloc( *m_pRoot->m_pDrawIndexedPrimitive );
  571. pJob->header.useInOutBuffer = 1;
  572. // we'll DMA get textures and layouts inside the job; we'll need space for DMA elements to do so
  573. pJob->header.sizeScratch = AlignValue( sizeof( JobDrawIndexedPrimitiveScratch_t ), 128 ) / 16;
  574. CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );
  575. dmaConstructor.AddInputDma( sizeof( g_spuGcmShared ), &g_spuGcmShared ); // dma[0]; must be the first to be 128-byte aligned for atomics
  576. dmaConstructor.AddInputDma( sizeof( *pVertDecl ), pVertDecl ); // dma[1]
  577. dmaConstructor.AddInputDma( sizeof( *pDrawHeader ), pDrawHeader ); // dma[2]
  578. COMPILE_TIME_ASSERT( sizeof( g_spuGcmShared ) < 16 * 1024 && sizeof( *pVertDecl ) < 16 * 1024 && sizeof( *pDrawHeader ) < 16 * 1024 );
  579. // pIbMarkup = pDrawHeader->m_eaIbMarkup;
  580. if ( pIbMarkup )
  581. {
  582. uint nIbMarkupBytes = ( pIbMarkup->m_numPartitions * sizeof( OptimizedModel::OptimizedIndexBufferMarkupPs3_t::Partition_t ) + sizeof( OptimizedModel::OptimizedIndexBufferMarkupPs3_t ) );
  583. dmaConstructor.AddInputDma( ( nIbMarkupBytes + 31 ) & -16, ( const void* )( uintp( pIbMarkup ) & -16 ) ); // dma[3]
  584. }
  585. //dmaConstructor.AddInputDmaLarge( SPUGCM_LSRING_SIZE, nUsefulBytesAligned, PCB_RING_CTX.begin ); // dma[4,5,6,7]
  586. dmaConstructor.AddSizeInOrInOut( SPUGCM_LSRING_SIZE );
  587. COMPILE_TIME_ASSERT( SPUGCM_LSRING_SIZE / (16*1024) <= 4 );
  588. // usage of the IO buffer slack:
  589. // alignment, sync signal, wrap sequence, alignment, RSX PUT control register output, SPURS job command output
  590. dmaConstructor.AddSizeInOrInOut(
  591. 128 // potential misalignment of command buffer, for double-bandwidth DMA to command buffer (not used now)
  592. + sizeof( SysringWrapSequence ) // is it accounted for in the LSRING_SLACK?
  593. + 16 // lsResetDrawBatch
  594. + 16 // lsTempRsxPut
  595. + 16 // g_lsDummyRead
  596. );
  597. COMPILE_TIME_ASSERT( sizeof( CPs3gcmTextureLayout::Format_t ) == 16 );
  598. dmaConstructor.AddCacheDma( g_nPs3texFormatCount * sizeof( CPs3gcmTextureLayout::Format_t ), g_ps3texFormats ) ; // dma[8]
  599. dmaConstructor.FinishIoBuffer( &pJob->header );
  600. pJob->header.sizeStack = 16 * 1024 / 16;
  601. pDrawHeader->m_nPs3texFormatCount = g_nPs3texFormatCount; // for reference; is not strictly needed here
  602. pDrawHeader->m_nUsefulCmdBytes = 0;//nUsefulBytes;
  603. pDrawHeader->m_nPcbringBegin = 0;//m_nPcbringBegin; // note: this is the post-updated buffer counter!
  604. pDrawHeader->m_nResultantSpuDrawGet = nResultantSpuDrawQueueSignal;
  605. #ifdef DBGFLAG_ASSERT
  606. SpuDrawQueue * pSignalDrawQueue = &m_spuDrawQueues[ nResultantSpuDrawQueueSignal & 3 ? 1 : 0 ];(void)pSignalDrawQueue;
  607. Assert( pSignalDrawQueue->IsValidCursor( (uint32*)( nResultantSpuDrawQueueSignal & ~3 ) ) );
  608. #endif
  609. uint nResultantSpuDrawQueueIndex = nResultantSpuDrawQueueSignal & 3;
  610. m_pDeferredChunkSubmittedTill[ nResultantSpuDrawQueueIndex ] = ( uint32* )( nResultantSpuDrawQueueSignal & ~3 );
  611. Assert( CELL_OK == cellSpursCheckJob( (const CellSpursJob256 *)pJob, sizeof( *pJob ), 256 ) );
  612. m_eaLastJobThatUpdatesSharedState = ( uintp )pJob;
  613. //PCB_RING_CTX.begin = PCB_RING_CTX.current = pSkipTo; // submitted; now when needed, we'll wait for SPU to reply through shared state
  614. //Assert( PCB_RING_CTX.begin <= PCB_RING_CTX.end );
  615. PushSpuGcmJob( pJob );
  616. // after this job runs, it spawns FPCP job, which will advance the FPCP state
  617. m_nFpcpStateEndOfJournalIdxAtSpuGcmJob = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
  618. if( SPUGCM_DEBUG_MODE )
  619. {
  620. // in SPUGCM_DEBUG_MODE, we execute all jobs and wait for them to complete. So, the GET pointer should always trail our pNext pointer
  621. Assert( g_spuGcmShared.m_nSpuDrawGet[ nResultantSpuDrawQueueIndex ] == ( nResultantSpuDrawQueueSignal & ~3 ) );
  622. }
  623. return pJob;
  624. }
  625. // BUG: pVertDecl may be released right after this call, we need to copy it somewhere or addref
  626. void CSpuGcm::SubmitDrawBatch( IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup )
  627. {
  628. Assert( m_nCurrentBatch == BATCH_DRAW );
  629. SpuDrawHeader_t * pDrawHeader = ( SpuDrawHeader_t * )GetCurrentBatchCursor();
  630. if ( pIbMarkup )
  631. {
  632. Assert( pIbMarkup->kHeaderCookie == pIbMarkup->m_uiHeaderCookie );
  633. // real markup exists in this index buffer
  634. pDrawHeader->m_eaIbMarkup = pIbMarkup;
  635. pDrawHeader->m_nIbMarkupPartitions = pIbMarkup->m_numPartitions;
  636. }
  637. else
  638. {
  639. pDrawHeader->m_eaIbMarkup = NULL;
  640. pDrawHeader->m_nIbMarkupPartitions = 0;
  641. }
  642. if( IsDeferredDrawQueue() )
  643. {
  644. *( ( IDirect3DVertexDeclaration9 ** )( pDrawHeader + 1 ) ) = pVertDecl;
  645. OpenDeferredChunk();
  646. m_nCurrentBatch = BATCH_GCMSTATE;
  647. ValidateDeferredQueue();
  648. }
  649. else
  650. {
  651. PushDrawBatchJob( GetDrawQueue()->GetSignal(), pDrawHeader, pVertDecl, pIbMarkup );
  652. BeginGcmStateTransaction();
  653. ZPassCheckpoint( 8 );
  654. if ( SPUGCM_DEBUG_MODE )
  655. {
  656. GCM_FUNC( cellGcmSetWriteBackEndLabel, GCM_LABEL_DEBUG0, (uint)pDrawHeader );
  657. CmdBufferFinish();
  658. volatile uint32 * pDebugLabel = cellGcmGetLabelAddress( GCM_LABEL_DEBUG0 );
  659. while( *pDebugLabel != ( uint ) pDrawHeader )
  660. {
  661. // this may happen due to latency , but it won't be an infinite loop
  662. //Msg( "Hmmmm... WriteLabel; Finish(); but label isn't set yet! 0x%X != 0x%X\n", *pDebugLabel, (uint)pDrawHeader );
  663. continue;
  664. }
  665. }
  666. }
  667. }
  668. bool ZPass::CanBegin( )
  669. {
  670. if( m_pCursor )
  671. {
  672. return false; // already begun
  673. }
  674. // we need at least some memory to store the job descriptor pointers
  675. if( GetSubchainCapacity( ) < 32 )
  676. {
  677. Warning( "Cannot begin ZPass: zpass job subchain buffer is full\n" );
  678. return false;
  679. }
  680. // we need a buffer in spuDrawQueue to store "ZPass begin, switch, end" commands
  681. // we may potentially need the space to store the whole state before ZPass, too
  682. return true;
  683. }
  684. void ZPass::Begin( uint32 * pCursor )
  685. {
  686. m_pCursor = pCursor;
  687. m_nDrawPassSubchain = m_nPut;
  688. m_pSubchain = GetCurrentCommandPtr();
  689. *m_pSubchain = CELL_SPURS_JOB_COMMAND_JTS;
  690. m_nFpcpStateEndOfJournalIdxAtZPassBegin = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
  691. }
  692. void ZPass::PushCommand( uint64 nCommand )
  693. {
  694. Validate();
  695. Assert( GetSubchainCapacity() > 2 );
  696. uint64 * pLwsync = GetCurrentCommandPtr();
  697. m_nPut++;
  698. uint64 * pCommand = GetCurrentCommandPtr();
  699. m_nPut++;
  700. uint64 * pJts = GetCurrentCommandPtr();
  701. Validate();
  702. *pJts = CELL_SPURS_JOB_COMMAND_JTS;
  703. *pCommand = nCommand;
  704. __lwsync();
  705. *pLwsync = CELL_SPURS_JOB_COMMAND_LWSYNC; // release the previous JTS
  706. }
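// The sequence above is a lock-free hand-off to the job chain consumer: a fresh JTS (jump-to-self)
// guard is written two slots ahead, the real command is written into the slot before it, and only
// after the lwsync barrier is the previous guard slot overwritten with LWSYNC, letting the consumer
// advance past it and pick up the new command.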
  707. bool CSpuGcm::BeginZPass( )
  708. {
  709. if( !IsDeferredDrawQueue() && m_zPass.CanBegin() )
  710. {
  711. // debug - do not checkin
  712. // while( g_pixelShaderPatcher.GetJournalSpaceLeftSince( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync ) > 20 )
  713. // {
  714. // g_pixelShaderPatcher.SetFragmentRegisterBlock(95, 1, (const float*)&g_spuGcmShared.m_eaFpcpSharedState->m_reg[95] );
  715. // }
  716. if( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob != g_pixelShaderPatcher.GetStateEndOfJournalIdx() )
  717. {
  718. GetDrawQueue()->Push2( SPUDRAWQUEUE_FLUSH_FPCP_JOURNAL, g_pixelShaderPatcher.GetStateEndOfJournalIdx() );
  719. }
  720. // this is where we start commands that we'll need to replay
  721. uint32 * pCursorBegin = GetDrawQueue()->GetCursor();
  722. uint nSafetyBufferWords = 4 ; // buffer so that when we come around, we can insert EndZPostPass method command (at least 3 words)
  723. uint nCommandWords = 2 // command : the command and EA of ZPassSavedState_t
  724. + nSafetyBufferWords
  725. + 4 // alignment buffer for ZPassSavedState_t
  726. + sizeof( ZPassSavedState_t );
  727. m_zPass.m_nJobPoolMarker = m_jobPool128.GetMarker();
  728. uint32 * pCmdBeginZPrepass = GetDrawQueue()->AllocWords( nCommandWords );
  729. pCmdBeginZPrepass[0] = SPUDRAWQUEUE_BEGINZPREPASS_METHOD | ( SPUDRAWQUEUE_BEGINZPREPASS_MASK & nCommandWords );
  730. ZPassSavedState_t * pSavedState = ( ZPassSavedState_t * )AlignValue( uintp( pCmdBeginZPrepass + 2 + nSafetyBufferWords ), 16 );
  731. pCmdBeginZPrepass[1] = ( uintp )pSavedState;
  732. m_zPass.m_pSavedState = pSavedState;
  733. //
  734. // WARNING.
  735. //
  736. // SPUDRAWQUEUE_BEGINZPREPASS_METHOD must be the last method that modifies g_spuGcmShared.m_dxGcmState in a job_gcmflush SpuDrawQueue.
  737. // This is because its implementation doesn't wait for DMA put to finish.
  738. //
  739. GCM_PERF_PUSH_MARKER( "ZPass_Z" );
  740. CmdBufferFlush();
  741. // actually begin; don't let anyone overwrite the commands after cursor
  742. m_zPass.Begin( pCursorBegin );
  743. GetDrawQueue()->PushFlushCallback( OnSpuDrawQueueFlushInZPass );
  744. PushSpuGcmCallSubchain( m_zPass.m_pSubchain ); // call all those SPUGCM jobs for the first time
  745. return true;
  746. }
  747. else
  748. return false;
  749. }
  750. void CSpuGcm::SetPredication( uint nPredicationMask ) // D3DPRED_* mask
  751. {
  752. uint32 * pCmd = GetDrawQueue()->AllocWords( 1 );
  753. *pCmd = SPUDRAWQUEUE_PREDICATION_METHOD | ( SPUDRAWQUEUE_PREDICATION_MASK & nPredicationMask );
  754. }
  755. void CSpuGcm::EndZPass( bool bPopMarker )
  756. {
  757. if( m_zPass && !m_zPass.m_isInEndZPass )
  758. {
  759. m_zPass.m_isInEndZPass = 1;
  760. GetDrawQueue()->PopFlushCallback();
  761. // as a precaution, since we don't need watermark-flush callbacks for the duration of this function, we'll disable it to avoid recursive flushes
  762. GetDrawQueue()->PushFlushCallback( OnSpuDrawQueueFlushDoNothing );
  763. // flush whatever state we may have.. it's not really needed to replay it twice, but whatever. we do need to replay it the 2nd time, and we can't just skip on it easily now in the 1st pass
  764. CmdBufferFlush();
  765. m_zPass.PushCommand( CELL_SPURS_JOB_COMMAND_RET );
  766. m_zPass.End(); // at this point, there's no more "Z prepass". There's just a bunch of SPUGCM commands waiting to be executed
  767. // replay from cursor
  768. uint32 * pCmdEndZPrepass = GetDrawQueue()->AllocWords( 2 );
  769. //m_nGcmFlushJobScratchSize = MAX( m_nGcmFlushJobScratchSize, CELL_GCM_VTXPRG_MAX_CONST );
  770. pCmdEndZPrepass[0] = SPUDRAWQUEUE_ENDZPREPASS_METHOD;
  771. pCmdEndZPrepass[1] = ( uintp )m_zPass.m_pSavedState;
  772. if( bPopMarker )
  773. {
  774. GCM_PERF_POP_MARKER( /*"ZPass_Z"*/ );
  775. GCM_PERF_MARKER( "ZPass_ZEnd" );
  776. }
  777. else
  778. {
  779. GCM_PERF_MARKER( "ZPass_Abort" );
  780. }
  781. CmdBufferFlush(); // commit the "End Z Prepass" command. NOTE: we don't want to commit it twice, so we End ZPass BEFORE we commit this command
  782. // even though Z Prepass is ended now, all those commands and their memory are still intact
  783. // re-execute them here now
  784. PushSpuGcmCallSubchain( m_zPass.m_pSubchain ); // call all those SPUGCM jobs again!
  785. GetDrawQueue()->PopFlushCallback();
  786. // SPUGCM ring release point: after this point, we can simply wait for more space to become available in SPUGCM draw command ring
  787. // Do we need to really end the render pass?
  788. // Hopefully not, because hopefully it'll just organically be indistinguishable from the non-Z-prepassed rendering
  789. uint32 * pCmdEndZPostPass = GetDrawQueue()->AllocWords( 3 );
  790. pCmdEndZPostPass[0] = SPUDRAWQUEUE_ENDZPOSTPASS_METHOD;
  791. pCmdEndZPostPass[1] = m_zPass.m_nPut;
  792. pCmdEndZPostPass[2] = (uintp)&m_zPass.m_nGet;
  793. GCM_PERF_MARKER( bPopMarker ? "ZPass_RenderEnd" : "AbortedZPass_RenderEnd" );
  794. CmdBufferFlush();
  795. m_zPass.m_isInEndZPass = 0;
  796. }
  797. else
  798. {
  799. if( bPopMarker )
  800. {
  801. GCM_PERF_POP_MARKER( );
  802. }
  803. }
  804. }
  805. void ZPass::Init()
  806. {
  807. m_nDummy = 0;
  808. m_pCursor = NULL;
  809. m_nJobs = 2048;
  810. m_pJobs = (uint64*)MemAlloc_AllocAligned( ( m_nJobs + 1 )* sizeof( uint64 ), 16 );
  811. m_pJobs[m_nJobs] = CELL_SPURS_JOB_COMMAND_NEXT( m_pJobs );
  812. m_nGet = 0;
  813. m_nPut = 0;
  814. m_isInEndZPass = 0;
  815. }
  816. void ZPass::Shutdown()
  817. {
  818. MemAlloc_FreeAligned( m_pJobs );
  819. }
  820. //#endif
  821. uint g_nEdgeJobChainMaxContention = 5;
  822. void CSpuGcm::OnVjobsInit()
  823. {
  824. int nJobPoolCount = Max<uint>( 256, g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 );
  825. int nCmdLineJobPoolCount = CommandLine()->ParmValue( "-spugcmJobPool", nJobPoolCount );
  826. if( nCmdLineJobPoolCount > nJobPoolCount && !( nCmdLineJobPoolCount & ( nCmdLineJobPoolCount - 1 ) ) )
  827. {
  828. Msg("Increasing spugcm cjob pool count from %d to %d\n", nJobPoolCount, nCmdLineJobPoolCount );
  829. nJobPoolCount = nCmdLineJobPoolCount;
  830. }
  831. // priority lower than the main job queue, in order to yield
  832. if( int nError = m_jobSink.Init( m_pRoot, 1, nJobPoolCount, ( uint8_t* )&m_pRoot->m_nSpugcmChainPriority, "spugcm", DMATAG_GCM_JOBCHAIN ) )
  833. {
  834. Error( "Cannot init SpuGcm, cell error %d\n", nError );
  835. }
  836. COMPILE_TIME_ASSERT( sizeof( job_edgegeom::JobDescriptor_t ) == 512 );
  837. if( int nError = g_spuGcmShared.m_edgeJobChain.Init( m_pRoot, g_nEdgeJobChainMaxContention, 128, ( uint8_t* )&m_pRoot->m_nEdgeChainPriority, sizeof( job_edgegeom::JobDescriptor_t ), CELL_SPURS_JOBQUEUE_DEFAULT_MAX_GRAB, "edge", DMATAG_EDGE_JOBCHAIN ) )
  838. {
  839. Error(" Cannot init SpuGcm, edge jobchain, error %d\n", nError );
  840. }
  841. if( int nError = g_spuGcmShared.m_fpcpJobChain.Init( m_pRoot, 1, 512, ( uint8_t* )&m_pRoot->m_nFpcpChainPriority, 128, CELL_SPURS_JOBQUEUE_DEFAULT_MAX_GRAB, "fpcp", DMATAG_FPCP_JOBCHAIN ) )
  842. {
  843. Error(" Cannot init SpuGcm, fpcp jobchain, error %d\n", nError );
  844. }
  845. if( nJobPoolCount < g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 ) // we need at least this much to avoid at least most stalls
  846. {
  847. Error( "Job pool count %d is too small! With %d jobs per segment, make it at least %d\n", nJobPoolCount, g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment(), g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 );
  848. }
  849. m_jobPool128.Init( nJobPoolCount );
  850. g_spuGcmShared.m_jobPoolEdgeGeom.Init( 128 );
  851. g_spuGcmShared.m_jobFpcPatch2 = *( m_pRoot->m_pFpcPatch2 );
  852. g_spuGcmShared.m_jobEdgeGeom = *( m_pRoot->m_pEdgeGeom );
  853. if( m_pMlaaBuffer )
  854. {
  855. g_edgePostWorkload.OnVjobsInit( m_pRoot );
  856. }
  857. }
  858. #if 0 // priorities test
  859. bool PriorityTest_t::Test( class VjobChain4 *pJobChain )
  860. {
  861. m_notify.m_nCopyFrom = 1;
  862. m_notify.m_nCopyTo = 0;
  863. uint nTick0 = __mftb();
  864. pJobChain->Run();
  865. uint nTick1 = __mftb();
  866. *( pJobChain->Push() ) = CELL_SPURS_JOB_COMMAND_JOB( &m_job );
  867. uint nTick2 = __mftb(), nTick3;
  868. do
  869. {
  870. nTick3 = __mftb();
  871. if( nTick3 - nTick2 > 79800000 * 5 )
  872. {
  873. Msg("%s:HANG\n", pJobChain->GetName());
  874. return false;
  875. }
  876. }
  877. while( !*(volatile uint32*)&m_notify.m_nCopyTo );
  878. Msg("%s[%d]:%5.0f+%5.0f(run=%5.0f)\n", pJobChain->GetName(), m_notify.m_nSpuId, (nTick2-nTick1)*40.1f, (nTick3-nTick2)*40.1f, (nTick1 - nTick0) * 40.1f );
  879. return true;
  880. }
  881. void CSpuGcm::TestPriorities()
  882. {
  883. PriorityTest_t * pTest = (PriorityTest_t*)MemAlloc_AllocAligned( sizeof( PriorityTest_t ), 128 );
  884. V_memset( &pTest->m_job, 0, sizeof( pTest->m_job ) );
  885. pTest->m_job.header = *(m_pRoot->m_pJobNotify);
  886. pTest->m_job.header.useInOutBuffer = 1;
  887. AddInputDma( &pTest->m_job, sizeof( pTest->m_notify ), &pTest->m_notify );
  888. pTest->m_job.workArea.userData[1] = 0; // function: default
  889. for( uint i = 0; i < 50; ++ i)
  890. {
  891. if( !pTest->Test( &g_spuGcmShared.m_edgeJobChain ) )
  892. return ; // leak
  893. if( ! pTest->Test( &g_spuGcmShared.m_fpcpJobChain ) )
  894. return ; // leak
  895. }
  896. MemAlloc_FreeAligned( pTest );
  897. }
  898. #endif
  899. void CSpuGcm::OnVjobsShutdown() // gets called before m_pRoot is about to be destructed and NULL'ed
  900. {
  901. CmdBufferFinish();
  902. g_edgePostWorkload.OnVjobsShutdown( m_pRoot );
  903. // in case of priority issues with job chains (when experimenting with reload_vjobs), let's first end and then join all workloads
  904. m_jobSink.End();
  905. g_spuGcmShared.m_fpcpJobChain.End();
  906. g_spuGcmShared.m_edgeJobChain.End();
  907. m_jobSink.Join();
  908. g_spuGcmShared.m_fpcpJobChain.Join();
  909. g_spuGcmShared.m_edgeJobChain.Join();
  910. m_jobPool128.Shutdown();
  911. g_spuGcmShared.m_jobPoolEdgeGeom.Shutdown();
  912. }
  913. void CSpuGcm::Shutdown()
  914. {
  915. g_pVJobs->Unregister( this ); // note: this will also call VjobsShutdown, which will join all SPU workloads and effectively call CmdBufferFinish();
  916. g_edgeGeomRing.Shutdown();
  917. if( m_pPcbringBuffer )
  918. {
  919. MemAlloc_FreeAligned( m_pPcbringBuffer );
  920. }
  921. m_spuDrawQueues[1].Shutdown();
  922. m_spuDrawQueues[0].Shutdown();
  923. #if SPU_GCM_DEBUG_TRACE
  924. MemAlloc_FreeAligned( g_spuGcmShared.m_eaDebugTraceBuffer );
  925. #endif
  926. m_zPass.Shutdown();
  927. for( uint i = 0; i < ARRAYSIZE( m_pDeferredStates ); ++i )
  928. {
  929. g_ps3gcmGlobalState.IoSlackFree( m_pDeferredStates[i] );
  930. }
  931. }
  932. void CSpuGcm::BeginScene()
  933. {
  934. DrawQueueNormal();
  935. if( m_nFramesToDisableDeferredQueue > 0 )
  936. {
  937. m_nFramesToDisableDeferredQueue-- ;
  938. }
  939. }
  940. void CSpuGcm::EndScene()
  941. {
  942. g_aici.m_nCpuActivityMask = g_edgeGeomRing.m_nUsedSpus;
  943. g_edgeGeomRing.m_nUsedSpus = 0;
  944. g_aici.m_nDeferredWordsAllocated = m_spuDrawQueues[1].m_nAllocWords - m_nDeferredQueueWords;
  945. m_nDeferredQueueWords = m_spuDrawQueues[1].m_nAllocWords;
  946. if( m_zPass )
  947. {
  948. ExecuteNTimes( 100, Warning( "SpuGcm:EndScene must Abort ZPass; mismatched BeginZPass/EndZPass\n" ) );
  949. AbortZPass();
  950. }
  951. if( g_spuGcmShared.m_enableStallWarnings )
  952. {
  953. if( m_jobPool128.m_nWaitSpins > 100 )
  954. {
  955. if( g_spuGcmShared.m_enableStallWarnings )
  956. {
  957. Warning( "SpuGcm: %d spins in job pool, PPU is really ahead of SPU and (probably) RSX.\n", m_jobPool128.m_nWaitSpins );
  958. }
  959. }
  960. m_jobPool128.m_nWaitSpins = 0;
  961. /*
  962. if( m_jobPool256.m_nWaitSpins )
  963. {
  964. if( g_spuGcmShared.m_enableStallWarnings )
  965. {
  966. Warning( "SpuGcm: %d spins in job pool 256, PPU is really ahead of SPU and (probably) RSX.\n", m_jobPool256.m_nWaitSpins );
  967. }
  968. m_jobPool256.m_nWaitSpins = 0;
  969. }
  970. */
  971. if( m_nPcbringWaitSpins > 100 )
  972. {
  973. if( g_spuGcmShared.m_enableStallWarnings )
  974. {
  975. Warning( "SpuGcm: %d spins in PcbRing, PPU is waiting for SPU (possibly) waiting for RSX\n", m_nPcbringWaitSpins );
  976. }
  977. }
  978. m_nPcbringWaitSpins = 0;
  979. }
  980. m_nFrame++;
  981. COMPILE_TIME_ASSERT( ARRAYSIZE( m_pDeferredStates ) == 2 ); // we need to rotate the array if it's not 2-element
  982. Swap( m_pDeferredStates[0], m_pDeferredStates[1] );
  983. extern ConVar r_ps3_mlaa;
  984. m_bUseDeferredDrawQueue = m_pMlaaBuffer && !( r_ps3_mlaa.GetInt() & 16 );
  985. }
  986. void CSpuGcm::CmdBufferFinish()
  987. {
  988. #ifdef CELL_GCM_DEBUG // [
  989. extern void (*fnSaveCellGcmDebugCallback)(struct CellGcmContextData*) = gCellGcmDebugCallback;
  990. gCellGcmDebugCallback = NULL; // disable recursive callback
  991. #endif // ]
  992. s_nFinishLabelValue++;
  993. GCM_FUNC( cellGcmSetWriteBackEndLabel, GCM_LABEL_SPUGCM_FINISH, s_nFinishLabelValue );
  994. CmdBufferFlush();
  995. Assert( s_nStopAtFinishLabelValue != s_nFinishLabelValue );
  996. // now wait for RSX to reach the finish label
  997. uint nSpins = 0;
  998. uint nTbStart = __mftb();
  999. volatile uint32 * pLastJobUpdate = &g_spuGcmShared.m_eaLastJobThatUpdatedMe;
  1000. while( ( s_nFinishLabelValue != *m_pFinishLabel ) ||
  1001. ( *pLastJobUpdate != m_eaLastJobThatUpdatesSharedState ) )
  1002. {
  1003. sys_timer_usleep( 30 ); // don't hog the PPU
  1004. ++nSpins;
  1005. #ifndef _CERT
  1006. if( nSpins && ( nSpins % 100000 == 0 ) )
  1007. {
  1008. Warning(
  1009. "** SpuGcm detected an SPU/RSX hang. **\n"
  1010. );
  1011. }
  1012. #endif
  1013. }
  1014. uint nTbEnd = __mftb();
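// __mftb() reads the PPU time base (the PS3 time base runs at roughly 79.8 MHz), so the division
// by 80 below converts the tick delta into an approximate microsecond count.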
  1015. if( nSpins > 1000 )
  1016. {
  1017. Warning( "Long wait (%d us / %d spins) in CmdBufferFinish()\n", ( nTbEnd - nTbStart ) / 80, nSpins );
  1018. }
  1019. #ifdef CELL_GCM_DEBUG // [
  1020. gCellGcmDebugCallback = fnSaveCellGcmDebugCallback;
  1021. #endif // ]
  1022. }
  1023. void CSpuGcm::SyncMlaa( void * pLocalSurface )
  1024. {
  1025. uint nInSurfaceOffset = ( g_ps3gcmGlobalState.m_nRenderSize[1]/2 * g_ps3gcmGlobalState.m_nSurfaceRenderPitch ) & -16;
  1026. vec_int4 * pIn = ( vec_int4 * )( ( uintp( m_pMlaaBuffer ) + nInSurfaceOffset ) ), *pOut = ( vec_int4 * ) ( uintp( pLocalSurface ) + nInSurfaceOffset );
  1027. uint nRowWidth = g_ps3gcmGlobalState.m_nSurfaceRenderPitch/64, nExclude = ( m_nFrame % ( nRowWidth - 2 ) ) + 1;
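// The loops below appear to copy a short band of the MLAA buffer back to the local surface with every
// 16-byte vector bitwise-inverted ( vec_nor( x, x ) == ~x ), skipping one 64-byte column whose index
// rotates with the frame counter, which reads like a moving visual marker used to verify synchronization.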
  1028. for( uint nRow = 0; nRow < 4; ++nRow )
  1029. {
  1030. vec_int4 * pRowIn = AddBytes( pIn, g_ps3gcmGlobalState.m_nSurfaceRenderPitch * nRow );
  1031. vec_int4 * pRowOut = AddBytes( pOut, g_ps3gcmGlobalState.m_nSurfaceRenderPitch * nRow );
  1032. for( uint i = 0; i < nExclude; i ++ )
  1033. {
  1034. vec_int4 *input = pRowIn + i * 4, *output = pRowOut + i * 4;
  1035. output[0] = vec_nor( input[0], input[0] );
  1036. output[1] = vec_nor( input[1], input[1] );
  1037. output[2] = vec_nor( input[2], input[2] );
  1038. output[3] = vec_nor( input[3], input[3] );
  1039. }
  1040. for( uint i = nExclude + 1; i < nRowWidth ; ++i )
  1041. {
  1042. vec_int4 *input = pRowIn + i*4, *output = pRowOut + i*4;
  1043. output[0] = vec_nor( input[0], input[0] );
  1044. output[1] = vec_nor( input[1], input[1] );
  1045. output[2] = vec_nor( input[2], input[2] );
  1046. output[3] = vec_nor( input[3], input[3] );
  1047. }
  1048. }
  1049. }
  1050. void CSpuGcm::CloseDeferredChunk()
  1051. {
  1052. Assert( m_nSpuDrawQueueSelector == 1 );
  1053. uint32 * pDeferredQueueCursor = m_spuDrawQueues[1].GetCursor();
  1054. if( m_pDeferredChunkHead )
  1055. {
  1056. #ifdef _DEBUG
  1057. m_nChunksClosedInSegment++;
  1058. #endif
  1059. // mark the previous chunk with its end
  1060. m_pDeferredChunkHead[1] = ( uint32 )pDeferredQueueCursor;
  1061. m_pDeferredChunkHead = NULL;
  1062. }
  1063. m_pDeferredQueueCursors[0] = pDeferredQueueCursor;
  1064. ValidateDeferredQueue();
  1065. }
  1066. #if SPUGCM_DEBUG_MODE
  1067. uint g_nDeferredChunks[0x800][4], g_nDeferredChunkCount = 0;
  1068. #endif
  1069. uint32* CSpuGcm::OpenDeferredChunk( uint nHeader, uint nAllocExtra )
  1070. {
  1071. Assert( IsValidDeferredHeader( nHeader ) );
  1072. Assert( m_nSpuDrawQueueSelector == 1 );
  1073. // skip allocation of the new chunk if the current chunk is empty
  1074. if( !m_pDeferredChunkHead || m_pDeferredChunkHead + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS != GetDrawQueue()->GetCursor() || nAllocExtra > 0 )
  1075. {
  1076. // we don't have an empty chunk already; allocate more
  1077. CloseDeferredChunk();
  1078. m_pDeferredChunkHead = GetDrawQueue()->AllocWords( SPUDRAWQUEUE_DEFERRED_HEADER_WORDS + nAllocExtra );
  1079. }
1080. m_pDeferredChunkHead[0] = nHeader; // record the chunk type; by default callers pass the plain GCM flush method
  1081. m_nDeferredChunkHead = nHeader;
  1082. m_pDeferredChunkHead[1] = ( uintp )GetDrawQueue()->GetCursor();
  1083. ValidateDeferredQueue();
  1084. #ifdef _DEBUG
  1085. if( SPUDRAWQUEUE_DEFERRED_HEADER_WORDS > 2 )
  1086. {
  1087. m_pDeferredChunkHead[2] = GetDrawQueue()->m_nAllocCount;
  1088. }
  1089. #endif
  1090. #if SPUGCM_DEBUG_MODE
  1091. uint nIdx = (g_nDeferredChunkCount++)%(ARRAYSIZE(g_nDeferredChunks));
  1092. Assert( nIdx < ARRAYSIZE(g_nDeferredChunks) );
  1093. uint * pDebug = g_nDeferredChunks[nIdx];
  1094. pDebug[0] = nHeader;
  1095. pDebug[1] = (uint32)m_pDeferredChunkHead;
  1096. pDebug[2] = nAllocExtra;
  1097. pDebug[3] = GetDrawQueue()->m_nAllocCount;
  1098. #endif
  1099. GetDrawQueue()->SetFlushWatermarkFrom( m_pDeferredChunkHead );
  1100. return m_pDeferredChunkHead + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS;
  1101. }
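// m_nSpuDrawQueueSelector appears to select the active SpuDrawQueue: 0 is the normal
// (immediate) queue that feeds the SPUs directly, 1 is the deferred queue whose chunks are
// recorded now and replayed later (see ExecuteDeferredDrawQueue). DrawQueueNormal switches
// back to the immediate queue, closing the currently open deferred chunk first.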
  1102. void CSpuGcm::DrawQueueNormal( bool bExecuteDeferredQueueSegment )
  1103. {
  1104. if( m_nSpuDrawQueueSelector != 0 )
  1105. {
  1106. FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
  1107. Assert( *m_pDeferredChunkHead == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD );
  1108. GetDrawQueue()->Push1( SPUDRAWQUEUE_PERF_MARKER_DrawNormal );
  1109. CloseDeferredChunk();
  1110. m_pDeferredQueueCursors[0] = m_spuDrawQueues[1].GetCursor();
  1111. /*uint nBytesInSegment = m_spuDrawQueues[1].Length( m_pDeferredQueueSegment, m_pDeferredQueueCursors[0] );
  1112. Msg( "DrawQueueNormal %p..%p=%.1fKB (%p,%p)\n", m_pDeferredQueueSegment, m_pDeferredQueueCursors[0],
  1113. nBytesInSegment / 1024.0f,
  1114. m_pDeferredQueueCursors[1], m_pDeferredQueueCursors[2] );*/
  1115. m_nSpuDrawQueueSelector = 0;
  1116. if( m_pDeferredQueueSegment && bExecuteDeferredQueueSegment )
  1117. {
  1118. ExecuteDeferredDrawQueueSegment( m_pDeferredQueueSegment, m_pDeferredQueueCursors[0], false );
  1119. m_pDeferredQueueSegment = NULL;
  1120. }
  1121. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
  1122. m_pDeferredChunkHead = NULL;
  1123. BeginGcmStateTransaction();
  1124. }
  1125. if( m_nFramesToDisableDeferredQueue > 0 )
  1126. {
  1127. ExecuteDeferredDrawQueue( 0 );
  1128. }
  1129. }
  1130. /*
  1131. void CSpuGcm::DisableMlaaForTwoFrames()
  1132. {
  1133. g_flipHandler.DisableMlaaForTwoFrames();
  1134. m_nFramesToDisableDeferredQueue = 2; // this frame and next will have disabled deferred queue
  1135. DrawQueueNormal();
  1136. }
  1137. */
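// There appear to be two ways to turn MLAA off: DisableMlaa() just switches back to the
// normal queue and tells the flip handler to skip MLAA, leaving the deferred commands to
// be flushed at Flip as usual; DisableMlaaPermanently() additionally replays the previous
// frame's deferred queue right away and then restores the render target, since the
// replayed commands may have rebound it.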
  1138. void CSpuGcm::DisableMlaa()
  1139. {
  1140. DrawQueueNormal( false );
1141. // we could flush the previous frame here, but we don't have to:
1142. // we'll do that at Flip, the same way we do it every frame
  1143. g_flipHandler.DisableMlaa();
  1144. }
  1145. void CSpuGcm::DisableMlaaPermanently()
  1146. {
  1147. DrawQueueNormal( false );
  1148. g_flipHandler.QmsAdviceBeforeDrawPrevFramebuffer();
  1149. // flush previous frame first
  1150. ExecuteDeferredDrawQueue( 1 );
  1151. g_flipHandler.DisableMlaaPermannetly();
  1152. g_flipHandler.DisableMlaa();
  1153. extern void DxDeviceForceUpdateRenderTarget( );
1154. DxDeviceForceUpdateRenderTarget( ); // recover the main render target, as it was clobbered by executing the previous frame's commands
  1155. ExecuteDeferredDrawQueue( 0 );
  1156. }
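// When the deferred queue is enabled, DrawQueueDeferred() flushes the immediate queue,
// switches the selector to queue 1 and opens a GCMFLUSH chunk. On the first call of a
// frame it also pushes SPUDRAWQUEUE_DEFER_STATE on the immediate queue ( presumably
// snapshotting the current state into m_pDeferredStates[0] ) and records
// SPUDRAWQUEUE_UNDEFER_STATE into the deferred queue itself, so that the snapshot is
// re-applied when the recorded chunks are replayed.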
  1157. CSpuGcm::DrawQueueDeferred_Result CSpuGcm::DrawQueueDeferred() // may flush previous frame deferred queue the first time
  1158. {
  1159. DrawQueueDeferred_Result result;
  1160. if( m_bUseDeferredDrawQueue && ( m_nFramesToDisableDeferredQueue == 0 ) && ( m_nSpuDrawQueueSelector != 1 ) )
  1161. {
  1162. FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
  1163. // do we have anything in the deferred queue?
  1164. result.isFirstInFrame = m_pDeferredQueueCursors[0] == m_pDeferredQueueCursors[1];
  1165. GetDrawQueue()->Push1( SPUDRAWQUEUE_PERF_MARKER_DrawDeferred );
  1166. if( result.isFirstInFrame )
  1167. {
  1168. GetDrawQueue()->Push2( SPUDRAWQUEUE_DEFER_STATE, uintp( m_pDeferredStates[0] ) );
  1169. }
1170. // before we dive into the deferred queue, flush the current queue, because we'll have to restart it when we dive back out of the deferred queue
1171. // this also makes sure that any state dump the deferred queue needs is written out before the deferred queue tries to execute
  1172. GcmStateFlush();
  1173. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
  1174. //ExecuteDeferredDrawQueue( 1 ); // dubious: we might want to execute this in the end of the frame to avoid undesirable state changes
  1175. m_nSpuDrawQueueSelector = 1;
  1176. BeginGcmStateTransaction();
  1177. m_pDeferredQueueSegment = m_spuDrawQueues[1].GetCursor();
  1178. #ifdef _DEBUG
  1179. m_nChunksClosedInSegment = 0;
  1180. #endif
  1181. //Msg( "DrawQueueDeferred %p / %.1f KB free...", m_pDeferredQueueSegment, m_spuDrawQueues[1].Length( m_pDeferredQueueSegment, m_spuDrawQueues[1].m_pGet ) );
  1182. OpenDeferredChunk( SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD );
1183. if( result.isFirstInFrame ) // the "UNDEFER" command itself gets recorded into the deferred queue here
  1184. {
  1185. GetDrawQueue()->Push2( SPUDRAWQUEUE_UNDEFER_STATE, uintp( m_pDeferredStates[0] ) );
  1186. }
  1187. }
  1188. else
  1189. {
  1190. result.isFirstInFrame = false;
  1191. }
  1192. return result;
  1193. }
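// m_pDeferredQueueCursors appears to be a small history of deferred-queue cursors, one
// slot per frame: [0] is the end of the current frame's recording, [1] the end of the
// previous frame, and the last slot marks where replay last stopped.
// ExecuteDeferredDrawQueue( nPrevious ) replays everything between that oldest mark and
// cursor [nPrevious]; FlipDeferredDrawQueue shifts the history down by one slot each flip.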
1194. // returns true if the SPU will later free up some memory by poking the corresponding GET pointer
  1195. bool CSpuGcm::ExecuteDeferredDrawQueue( uint nPrevious )
  1196. {
  1197. Assert( !IsDeferredDrawQueue() );
  1198. // just copy the commands to the main spugcm buffer
  1199. Assert( m_pDeferredQueueCursors[0] == m_spuDrawQueues[1].GetCursor() || m_pDeferredQueueCursors[0] == m_pDeferredChunkHead );
  1200. uint32 * pCmdEnd = m_pDeferredQueueCursors[nPrevious];//, *pCmdEnd = ( ( nPrevious == 0 ) ? m_spuDrawQueues[1].GetCursor() : m_pDeferredQueueCursors[ nPrevious - 1 ] );
  1201. uint32 * pCmdBegin = m_pDeferredQueueCursors[ARRAYSIZE(m_pDeferredQueueCursors)-1];
  1202. if( pCmdEnd == pCmdBegin )
  1203. return false;
  1204. //Msg( "ExecuteDeferredDrawQueue(%d) %p..%p=%.1fKB\n", nPrevious, pCmdBegin, pCmdEnd, m_spuDrawQueues[1].Length( pCmdBegin, pCmdEnd ) );
  1205. FillNops( &PCB_RING_CTX );
  1206. #if defined( _DEBUG ) && !defined( _CERT )
  1207. m_spuDrawQueues[0].Push1( SPUDRAWQUEUE_PERF_MARKER_AAReplay );
  1208. #endif
  1209. GcmStateFlush();
1210. Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() ); // we're not in the deferred queue, so GcmStateFlush calls BeginGcmStateTransaction, which resets the current batch cursor
  1211. bool bMoveGet = ExecuteDeferredDrawQueueSegment( pCmdBegin, pCmdEnd, true );
  1212. #if defined( _DEBUG ) && !defined( _CERT )
  1213. m_spuDrawQueues[0].Push1( SPUDRAWQUEUE_PERF_MARKER_AAReplayEnd );
  1214. SetCurrentBatchCursor( GetDrawQueue()->GetCursor() );
  1215. #endif
  1216. // forget about previously executed frames/chunks
  1217. for( uint i = nPrevious + 1; i < ARRAYSIZE( m_pDeferredQueueCursors ); ++i )
  1218. m_pDeferredQueueCursors[i] = pCmdEnd;
  1219. return bMoveGet;
  1220. }
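// The replay walk below treats the segment as a chain of chunks: a zero word is a single
// NOP, a SPUDRAWQUEUE_NOPCOUNT_METHOD word skips a run of NOPs, and anything else is a
// chunk header whose second word points at the next chunk. SET_FP_CONST chunks are applied
// immediately on the PPU through the fragment-program patcher; GCMFLUSH and DRAW chunks
// become SPU jobs ( PushStateFlushJob / PushDrawBatchJob ), which is also what eventually
// advances the deferred queue's GET pointer and frees its memory.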
  1221. bool CSpuGcm::ExecuteDeferredDrawQueueSegment( uint32 * pCmdBegin, uint32 * pCmdEnd, bool bExecuteDraws )
  1222. {
  1223. Assert( m_nCurrentBatch == BATCH_GCMSTATE );
1224. // we must already have switched back to the normal queue before replaying deferred commands into it
  1225. Assert( !IsDeferredDrawQueue() );
  1226. bool bMoveGet = false;
1227. uint nResultantSpuDrawQueueIndex = bExecuteDraws ? 1 : 2; // index [2] is a dummy, write-only resultant "GET" register
  1228. #if SPUGCM_DEBUG_MODE
  1229. uint nDeferredChunkDebugIdx = 0xFFFFFFFF;
  1230. for( uint i = 1;i <= ARRAYSIZE( g_nDeferredChunks ); ++i )
  1231. {
  1232. uint j = ( g_nDeferredChunkCount - i ) & ( ARRAYSIZE( g_nDeferredChunks ) - 1 );
  1233. if( g_nDeferredChunks[j][1] == uintp( pCmdBegin ) )
  1234. {
  1235. nDeferredChunkDebugIdx = j;
  1236. break;
  1237. }
  1238. }
  1239. Assert( nDeferredChunkDebugIdx < ARRAYSIZE( g_nDeferredChunks ) );
  1240. #endif
  1241. SpuDrawQueue *pDrawQueue = &m_spuDrawQueues[1];
  1242. for( uint32 * pCmd = pDrawQueue->NormalizeCursor( pCmdBegin ), * pCmdNormalizedEnd = pDrawQueue->NormalizeCursor( pCmdEnd ), *pPrev = pCmd; pCmd != pCmdNormalizedEnd; )
  1243. {
  1244. if( !IsCert() && !pDrawQueue->IsValidCursor( pCmd ) )
  1245. DebuggerBreakIfDebugging();
  1246. uint nCmd = *pCmd;
  1247. if( nCmd == 0 )
  1248. {
  1249. pCmd++;
  1250. }
  1251. else if( ( nCmd & SPUDRAWQUEUE_METHOD_MASK ) == SPUDRAWQUEUE_NOPCOUNT_METHOD )
  1252. {
  1253. pCmd += 1 + ( nCmd & SPUDRAWQUEUE_NOPCOUNT_MASK );
  1254. }
  1255. else
  1256. {
  1257. uint32 * pNext = (uint32*)pCmd[1], *pCmdHeaderEnd = pCmd + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS;
  1258. Assert( m_spuDrawQueues[1].IsValidCursor( pNext ) );
  1259. #if SPUGCM_DEBUG_MODE
  1260. for( uint i = 0; ; ++i )
  1261. {
  1262. uint j = ( nDeferredChunkDebugIdx + i ) & ( ARRAYSIZE( g_nDeferredChunks ) - 1 );
  1263. if( g_nDeferredChunks[j][1] == uintp( pCmd ) )
  1264. {
  1265. nDeferredChunkDebugIdx = j;
  1266. break;
  1267. }
  1268. if( i >= ARRAYSIZE( g_nDeferredChunks ) ) // stop if we don't find the debug idx
  1269. {
  1270. DebuggerBreak();
  1271. break;
  1272. }
  1273. }
  1274. #endif
  1275. switch ( nCmd & SPUDRAWQUEUE_DEFERRED_METHOD_MASK )
  1276. {
  1277. case SPUDRAWQUEUE_DEFERRED_SET_FP_CONST_METHOD:
  1278. {
  1279. uint nStartRegister = ( nCmd >> 12 ) & 0xFFF, nRegisterCount = nCmd & 0xFFF;
  1280. Assert( nStartRegister < 96 && nRegisterCount <= 96 );
  1281. OnSetPixelShaderConstant();
  1282. g_pixelShaderPatcher.SetFragmentRegisterBlock( nStartRegister, nRegisterCount, ( const float* )pCmdHeaderEnd );
  1283. //m_dirtyCachesMask |= DxAbstractGcmState_t::kDirtyPxConstants;
  1284. }
  1285. break;
  1286. case SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD:
  1287. if( nCmd == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD || bExecuteDraws )
  1288. {
  1289. PushStateFlushJob( pDrawQueue, uint( pNext ) | nResultantSpuDrawQueueIndex, pCmdHeaderEnd, pNext );
  1290. Assert( m_pDeferredChunkSubmittedTill[nResultantSpuDrawQueueIndex] == pNext );
  1291. bMoveGet = true;
  1292. }
  1293. break;
  1294. case SPUDRAWQUEUE_DEFERRED_DRAW_METHOD:
  1295. if( bExecuteDraws )
  1296. {
  1297. Assert( nCmd == SPUDRAWQUEUE_DEFERRED_DRAW_METHOD );
  1298. SpuDrawHeader_t * pDrawHeader = ( SpuDrawHeader_t * )AlignValue( uintp( pCmdHeaderEnd ), 16 );
  1299. // at the time we set up these deferred calls, we don't track the FPCP journal, so we need to refresh the indices referring into it here
  1300. pDrawHeader->m_nFpcpEndOfJournalIdx = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
  1301. CellSpursJob128 * pDrawJob = PushDrawBatchJob( uint( pNext ) | nResultantSpuDrawQueueIndex, pDrawHeader, *( IDirect3DVertexDeclaration9** )( pDrawHeader + 1 ), pDrawHeader->m_eaIbMarkup );
  1302. Assert( m_pDeferredChunkSubmittedTill[nResultantSpuDrawQueueIndex] == pNext );
  1303. bMoveGet = true;
  1304. }
  1305. break;
  1306. }
  1307. pPrev = pCmd;
  1308. pCmd = pNext;
  1309. }
  1310. pCmd = pDrawQueue->NormalizeCursor( pCmd );
  1311. }
  1312. return bMoveGet;
  1313. }
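// At flip time the deferred-cursor history is rotated: the current queue cursor becomes
// slot [0] and every older slot moves one step towards the end of the array, so the last
// slot always marks the oldest data that may still need replaying.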
  1314. void CSpuGcm::FlipDeferredDrawQueue()
  1315. {
  1316. //Msg( "FlipDeferredDrawQueue {%p,%p,%p} Frame=%d\n", m_pDeferredQueueCursors[0], m_pDeferredQueueCursors[1], m_pDeferredQueueCursors[2], m_nFrame );
  1317. Assert( !IsDeferredDrawQueue() );
  1318. m_pDeferredQueueCursors[0] = m_spuDrawQueues[1].GetCursor();
  1319. for( uint i = ARRAYSIZE( m_pDeferredQueueCursors ); i-- > 1; )
  1320. {
  1321. m_pDeferredQueueCursors[ i ] = m_pDeferredQueueCursors[ i - 1 ];
  1322. }
  1323. }
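// The Edge Post workload runs MLAA on the SPUs. The scratch buffer is sized as
// EDGE_POST_MLAA_HANDLER_SPU_BUFFER_SIZE( numSpus ) * 3 - presumably triple-buffered -
// and the workload synchronizes with the RSX through the GCM_LABEL_EDGEPOSTMLAA label.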
  1324. void CEdgePostWorkload::OnVjobsInit( VJobsRoot* pRoot )
  1325. {
  1326. uint numSpus = 5, nScratchSize = EDGE_POST_MLAA_HANDLER_SPU_BUFFER_SIZE( numSpus ) * 3;
  1327. m_pMlaaScratch = MemAlloc_AllocAligned( nScratchSize, EDGE_POST_MLAA_HANDLER_BUFFER_ALIGN );
  1328. int nOk = edgePostMlaaInitializeContext( &m_mlaaContext, numSpus, &pRoot->m_spurs, ( uint8_t* )&pRoot->m_nEdgePostWorkloadPriority, GCM_LABEL_EDGEPOSTMLAA, m_pMlaaScratch, nScratchSize );
  1329. if( nOk != CELL_OK )
  1330. {
  1331. Warning("Cannot initialize MLAA, error %d\n", nOk );
  1332. edgePostMlaaDestroyContext( &m_mlaaContext );
  1333. MemAlloc_FreeAligned( m_pMlaaScratch );
  1334. return;
  1335. }
  1336. m_isInitialized = true;
  1337. }
  1338. void CEdgePostWorkload::OnVjobsShutdown( VJobsRoot* pRoot )
  1339. {
  1340. if( m_isInitialized )
  1341. {
  1342. edgePostMlaaDestroyContext( &m_mlaaContext );
  1343. MemAlloc_FreeAligned( m_pMlaaScratch );
  1344. m_isInitialized = false;
  1345. }
  1346. }
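// The "ghost" context below is a tiny, fixed-size CellGcmContextData that TruePause()
// points at a reserved slice of the sysring. It only ever holds a few jump and label-write
// commands, so running out of space indicates a logic error - hence the callback reports
// an error instead of growing the buffer.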
  1347. int32_t GhostGcmCtxCallback( struct CellGcmContextData *pContext, uint32_t nCount )
  1348. {
  1349. Error("Trying to allocate %d more words in the ghost context\n", nCount );
  1350. return CELL_ERROR_ERROR_FLAG;
  1351. }
  1352. enum TruePauseStateEnum_t
  1353. {
  1354. TRUE_PAUSE_NONE,
  1355. TRUE_PAUSE_SPINNING,
  1356. TRUE_PAUSE_LOCKED0, // locked, Shoulder and X buttons down
  1357. TRUE_PAUSE_LOCKED1, // locked, Shoulder button up
  1358. TRUE_PAUSE_SINGLE_STEP
  1359. };
  1360. TruePauseStateEnum_t g_nTruePauseState = TRUE_PAUSE_NONE;
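// TruePause() looks like a developer "freeze frame" mode: it finishes the command buffer,
// then repeatedly re-submits the last couple of captured GCM frames to the RSX by patching
// jump commands into the ends of those frames and bouncing the GCM_LABEL_REPLAY label
// through 0 -> 1 -> 2. Gamepad input drives the state machine above: hold the shoulder
// button to keep spinning, press the lock button to stay paused after releasing it, and
// the single-step button replays exactly one more frame before returning here.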
  1361. bool CSpuGcm::TruePause()
  1362. {
  1363. switch( g_nTruePauseState )
  1364. {
  1365. case TRUE_PAUSE_NONE:
  1366. g_nTruePauseState = TRUE_PAUSE_SPINNING;
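// deliberate fall-through: after arming the spin state, proceed the same way as a
// single-step re-entry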
  1367. case TRUE_PAUSE_SINGLE_STEP:
  1368. break; // re-entering after single step
  1369. default:
  1370. g_nTruePauseState = TRUE_PAUSE_NONE;
  1371. return false; // inconsistent state, don't try to continue
  1372. }
1373. CmdBufferFinish(); // this will put the end marker on the last frame.
  1374. g_spuGcmShared.m_sysring.NotifyRsxGet( g_spuGcmShared.m_eaGcmControlRegister->get );
  1375. //Assert( g_spuGcmShared.m_sysring.m_nPut == g_spuGcmShared.m_sysring.m_nEnd );
  1376. const uint nReserve = 0x1000;
  1377. if( !g_spuGcmShared.m_sysring.CanPutNoWrap( nReserve ) )
  1378. {
  1379. if( !g_spuGcmShared.m_sysring.CanWrapAndPut( nReserve ) )
  1380. {
  1381. Msg( "Cannot replay because sysring wraps around right here and you got unlucky. If you get this a lot, ask Sergiy to implement/fix wrap-around replay\n" );
  1382. return false;
  1383. }
  1384. g_spuGcmShared.WrapSequence();
  1385. }
  1386. int nReplayFrames = 2;
  1387. if( !g_spuGcmShared.CanReplayPastFrames( nReplayFrames, nReserve ) )
  1388. {
  1389. uint nSysringBytesNeeded = 0;
  1390. Warning( "Cannot replay frames: %d frames didn't fit into command buffer of %d bytes and was generated and executed in multiple passes/segments\n", nReplayFrames, g_ps3gcmGlobalState.m_nCmdSize );
  1391. return false;
  1392. }
1393. // all relevant SPU and RSX activity has ceased at this point
  1394. uintp eaEnd = g_spuGcmShared.m_sysring.EaPut();
  1395. uint32 * pEnd = (uint32*)eaEnd;
  1396. uint nIoOffsetEnd = eaEnd + g_spuGcmShared.m_nIoOffsetDelta;
  1397. //nOffsetBeginFrame = g_spuGcmShared.m_sysring.PutToEa( g_spuGcmShared.GetPastFrame(2).m_nSysringBegin ) + g_spuGcmShared.m_nIoOffsetDelta;
  1398. //uint nSurfaceFlipIndex = g_ps3gcmGlobalState.m_display.surfaceFlipIdx, nSurfaceFlipAltIndex = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex();
  1399. //CPs3gcmLocalMemoryBlock &altSurface = g_ps3gcmGlobalState.m_display.surfaceColor[nSurfaceFlipAltIndex];
  1400. //V_memset( altSurface.DataInAnyMemory(), 0, altSurface.Size() );
  1401. int nCurrentReplayFrame = 1;
1402. // Note: we probably shouldn't start with the frame that renders into the same surface as the last frame flipped
  1403. uint32 * pReplayLabelReset = (uint32*)g_spuGcmShared.m_sysring.EaPut();
  1404. uint nReplayLabelResetIoOffset = uintp( pReplayLabelReset ) + g_spuGcmShared.m_nIoOffsetDelta;
  1405. CellGcmContextData ghostCtx;
  1406. ghostCtx.current = ghostCtx.begin = pReplayLabelReset;
  1407. uint32 * pGhostAreaEnd = ghostCtx.end = ghostCtx.begin + ( nReserve / sizeof( uint32 ) );
  1408. ghostCtx.callback = GhostGcmCtxCallback;
  1409. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 0 );
  1410. uint32 * pReplayGhostArea = ghostCtx.current;
  1411. uint nReplayGhostAreaIoOffset = uintp( pReplayGhostArea ) + g_spuGcmShared.m_nIoOffsetDelta;
  1412. g_spuGcmShared.m_sysring.Put( uintp( pReplayGhostArea ) - uintp( pReplayLabelReset ) );
  1413. Assert( g_spuGcmShared.m_sysring.EaPut() == uintp( pReplayGhostArea ) );
  1414. volatile uint32 * pLabelReplay = cellGcmGetLabelAddress( GCM_LABEL_REPLAY );
  1415. *pLabelReplay = 0xFFFFFFFF;
  1416. __sync();
  1417. bool isFirstIteration = true;
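// Replay protocol, as far as can be told from the code below: each iteration the ghost
// area is rewritten so that the RSX resets GCM_LABEL_REPLAY to 0, jumps into the captured
// frame ( whose trailing NOP has been patched into a jump back here ), writes the label to
// 1, then flips the previously rendered surface and writes 2; the PPU busy-waits on each
// label value to stay in lockstep with the RSX.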
  1418. do
  1419. {
  1420. g_spuGcmShared.m_eaGcmControlRegister->put = nReplayGhostAreaIoOffset;
  1421. while( *pLabelReplay != 0 )
  1422. continue;
  1423. // we're now synchronized at the beginning of ghost area
  1424. switch( g_nTruePauseState )
  1425. {
  1426. case TRUE_PAUSE_NONE:
  1427. return false;
  1428. case TRUE_PAUSE_SINGLE_STEP:
  1429. if( !isFirstIteration )
  1430. {
  1431. return true;
  1432. }
  1433. break;
  1434. }
  1435. const BeginFrameRecord_t &pastFrame = g_spuGcmShared.GetPastFrame( nCurrentReplayFrame );
  1436. int nOffsetBeginFrame = uintp( pastFrame.m_eaBegin ) + g_spuGcmShared.m_nIoOffsetDelta, nOffsetEndFrame = uintp( pastFrame.m_eaEnd ) + g_spuGcmShared.m_nIoOffsetDelta;
  1437. Msg("frame@ %X..%X ", nOffsetBeginFrame , nOffsetEndFrame );
  1438. ghostCtx.current = ghostCtx.begin = pReplayGhostArea;
  1439. ghostCtx.end = pGhostAreaEnd;
  1440. *( ghostCtx.current++ ) = CELL_GCM_JUMP( nOffsetBeginFrame ); // jump to the beginning of the frame we want to replay
  1441. uint32 nOffsetReturnFromFrame = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
  1442. Assert( pastFrame.m_eaEnd[0] == 0 && pastFrame.m_eaEnd[1] == 0 && pastFrame.m_eaEnd[2] == 0 && pastFrame.m_eaEnd[3] == 0 ); // we expect 4 NOPs at the end of the frame
  1443. Assert( pastFrame.m_eaBegin[0] == 0 && pastFrame.m_eaBegin[1] == 0 && pastFrame.m_eaBegin[2] == 0 && pastFrame.m_eaBegin[3] == 0 ); // we expect 4 NOPs at the beginning of the frame
  1444. pastFrame.m_eaEnd[0] = CELL_GCM_JUMP( nOffsetReturnFromFrame ); // return to replay area after rendering the whole frame
  1445. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 1 );
  1446. __sync();
  1447. uint32 nTickStart = __mftb(); // let's start rendering (replaying) the captured GCM frame
  1448. g_spuGcmShared.m_eaGcmControlRegister->put = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
  1449. while( *pLabelReplay != 1 )
  1450. continue;
  1451. int nSurfaceFlipIndex = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex( nCurrentReplayFrame );
  1452. Assert( nSurfaceFlipIndex >= 0 );
  1453. while ( cellGcmGetFlipStatus() != CELL_GCM_DISPLAY_FLIP_STATUS_DONE )
  1454. {
  1455. // Wait for the previous flip to completely finish
  1456. ThreadSleep( 1 );
  1457. }
  1458. cellGcmResetFlipStatus(); // Need to reset GCM flip status
  1459. // start flipping
  1460. cellGcmSetFlip( &ghostCtx, nSurfaceFlipIndex );
  1461. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 2 );
  1462. int nOffsetEndOfFlip = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
1463. cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 3 ); // transitional value; the jump below leads back to the area that resets the label to 0
  1464. *( ghostCtx.current++ ) = CELL_GCM_JUMP( nReplayLabelResetIoOffset );
  1465. __sync();
  1466. g_spuGcmShared.m_eaGcmControlRegister->put = nOffsetEndOfFlip;
  1467. Msg( "[%d.%d] flip@ %X..%X. ", nCurrentReplayFrame, nSurfaceFlipIndex, nReplayGhostAreaIoOffset, nOffsetEndOfFlip );
  1468. while( *pLabelReplay != 2 )
  1469. continue;
1470. uint32 nFrameEnd = __mftb(); Msg( "%.2f ms (replay).\n", ( nFrameEnd - nTickStart ) / 79800.0f );
  1471. while ( cellGcmGetFlipStatus() != CELL_GCM_DISPLAY_FLIP_STATUS_DONE )
  1472. {
  1473. // Wait for the previous flip to completely finish
  1474. ThreadSleep( 1 );
  1475. }
  1476. uint32 nFlipEnd = __mftb(); Msg( "%.2f ms.\n", ( nFlipEnd - nTickStart ) / 79800.0f );
  1477. pastFrame.m_eaEnd[0] = CELL_GCM_METHOD_NOP;
  1478. __sync();
  1479. nCurrentReplayFrame = ( nCurrentReplayFrame + nReplayFrames - 1 ) % nReplayFrames;
  1480. int bContinueProcessing = 0;
  1481. CellPadData padData;
  1482. do
  1483. {
  1484. int nError = cellPadGetData( 0, &padData );
  1485. if( nError )
  1486. {
  1487. Msg( "Error 0x%X trying to get pad data, aborting true pause\n", nError );
  1488. g_nTruePauseState = TRUE_PAUSE_NONE;
  1489. return false;
  1490. }
  1491. else
  1492. {
  1493. if( padData.len >= 3 )
  1494. {
  1495. int isL1Down = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL2] & CELL_PAD_CTRL_R1;
  1496. int isTriangleDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_UP;
  1497. int isCrossDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_DOWN;
  1498. int isCircleDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_RIGHT;
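// Note: these masks don't obviously match the variable names - CELL_PAD_CTRL_R1 lives in
// the DIGITAL2 word while CELL_PAD_CTRL_UP/DOWN/RIGHT are DIGITAL1 d-pad bits - so this
// appears to read R1 and the d-pad rather than the face buttons the names suggest. The
// names are kept since the state machine below only cares about the roles
// ( pause / lock / single-step ), not the physical buttons.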
1499. bContinueProcessing = isTriangleDown; // keep looping here while this button is held down
  1500. int isLockDown = isCrossDown, isSingleStepDown = isCircleDown, isPauseDown = isL1Down;
  1501. if( g_nTruePauseState != TRUE_PAUSE_SINGLE_STEP && isSingleStepDown )
  1502. {
  1503. g_nTruePauseState = TRUE_PAUSE_SINGLE_STEP;
  1504. bContinueProcessing = false; // return to render a single step
  1505. }
  1506. switch( g_nTruePauseState )
  1507. {
  1508. case TRUE_PAUSE_LOCKED1:
  1509. case TRUE_PAUSE_LOCKED0:
  1510. if( isPauseDown )
  1511. {
  1512. if( g_nTruePauseState == TRUE_PAUSE_LOCKED1 )
  1513. {
  1514. g_nTruePauseState = TRUE_PAUSE_NONE; // second press on the shoulder releases the lock
  1515. bContinueProcessing = false;
  1516. }
  1517. }
  1518. else
  1519. {
  1520. if( g_nTruePauseState == TRUE_PAUSE_LOCKED0 )
  1521. {
  1522. g_nTruePauseState = TRUE_PAUSE_LOCKED1; // promote: shoulder isn't pressed any more
  1523. }
  1524. }
  1525. break;
  1526. case TRUE_PAUSE_SPINNING:
  1527. if( isLockDown )
  1528. {
  1529. g_nTruePauseState = TRUE_PAUSE_LOCKED0;
  1530. }
  1531. else if( isSingleStepDown )
  1532. {
  1533. g_nTruePauseState = TRUE_PAUSE_SINGLE_STEP;
  1534. bContinueProcessing = false; // do the single step
  1535. }
  1536. else if( !isPauseDown )
  1537. {
  1538. if( isFirstIteration )
  1539. {
  1540. g_nTruePauseState = TRUE_PAUSE_LOCKED1; // assume we go into locked state if L1 wasn't pressed the very first frame
  1541. }
  1542. else
  1543. {
  1544. g_nTruePauseState = TRUE_PAUSE_NONE;
  1545. bContinueProcessing = false;
  1546. }
  1547. }
  1548. break;
  1549. case TRUE_PAUSE_SINGLE_STEP:
1550. // we skipped one render frame; go back to the normal spinning state as soon as the user releases the single-step button
  1551. if( !isSingleStepDown )
  1552. {
  1553. if( isPauseDown )
  1554. {
  1555. g_nTruePauseState = TRUE_PAUSE_SPINNING; // the shoulder is still down, so the user didn't decide yet if they want to let the game go
  1556. }
  1557. else
  1558. {
  1559. g_nTruePauseState = TRUE_PAUSE_LOCKED1; // we let the shoulder go, so it must be a locked state
  1560. }
  1561. }
  1562. break;
  1563. }
  1564. }
  1565. }
  1566. isFirstIteration = false;
  1567. }
  1568. while( bContinueProcessing );
  1569. }
  1570. while( true );
  1571. return false;
  1572. }
  1573. static ConVar spugcm_validatedeferredqueue( "spugcm_validatedeferredqueue", "0" );
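// ValidateDeferredQueue is a debug-only, ConVar-gated consistency walk over the deferred
// queue: it follows the chunk chain from the oldest recorded cursor to the currently open
// chunk, checking every header and next-pointer and allowing at most one wrap-around of
// the ring buffer.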
  1574. void CSpuGcm::ValidateDeferredQueue()
  1575. {
  1576. #ifdef _DEBUG
  1577. if( !spugcm_validatedeferredqueue.GetBool() )
  1578. return;
  1579. uint32 * pCmdEnd = m_pDeferredChunkHead;
  1580. if( !pCmdEnd )
  1581. pCmdEnd = m_pDeferredQueueCursors[0];
  1582. pCmdEnd = m_spuDrawQueues[1].NormalizeCursor( pCmdEnd );
  1583. Assert( m_spuDrawQueues[1].IsValidCursor( pCmdEnd ) );
  1584. uint32 * pCmdBegin = m_pDeferredQueueCursors[ARRAYSIZE(m_pDeferredQueueCursors)-1];
  1585. uint nWraps = 0;
  1586. for( uint32 * pCmd = pCmdBegin; pCmd != pCmdEnd; )
  1587. {
  1588. uint nCmd = *pCmd;
  1589. if( nCmd == 0 )
  1590. {
  1591. pCmd++;
  1592. }
  1593. else if( ( nCmd & SPUDRAWQUEUE_METHOD_MASK ) == SPUDRAWQUEUE_NOPCOUNT_METHOD )
  1594. {
  1595. pCmd += 1 + ( nCmd & SPUDRAWQUEUE_NOPCOUNT_MASK );
  1596. }
  1597. else
  1598. {
  1599. Assert( IsValidDeferredHeader( nCmd ) );
  1600. Assert( nWraps == 0 || pCmd < pCmdBegin );
  1601. Assert( m_spuDrawQueues[ 1 ].IsValidCursor( pCmd ) );
  1602. uint32 * pNext = ( uint32* )pCmd[ 1 ];
  1603. Assert( m_spuDrawQueues[ 1 ].IsValidCursor( pNext ) );
  1604. if( pNext < pCmd )
  1605. {
  1606. Assert( nWraps == 0 );
  1607. nWraps++;
  1608. }
  1609. pCmd = pNext;
  1610. }
  1611. pCmd = m_spuDrawQueues[1].NormalizeCursor( pCmd );
  1612. }
  1613. #endif
  1614. }