Counter Strike : Global Offensive Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

611 lines
24 KiB

  1. //========= Copyright (c) Valve Corporation, All rights reserved. ====//
  2. #include "tier0/platform.h"
  3. #ifdef _PS3
  4. #include "dxabstract.h"
  5. #include <sys/memory.h>
  6. #include "ps3/spugcm_shared.h"
  7. #include "fpcpatcher_spu.h"
  8. #include "cg/cg.h"
  9. #include "cg/cgBinary.h"
  10. #include "vjobs_interface.h"
  11. #include "tier0/hardware_clock_fast.h"
  12. #include "vjobs/fpcpatch_shared.h"
  13. #include "vjobs/root.h"
  14. #include "ps3/vjobutils.h"
  15. #include "tier0/microprofiler.h"
  16. #include "ps3/ps3_gcm_config.h"
  17. #include "spugcm.h"
  18. enum
  19. {
  20. PROFILE_SCE_VP_RSX = 7003,
  21. PROFILE_SCE_FP_RSX = 7004
  22. };
  23. #define GCM_MUST_SUCCEED( FUNC, ... ) do { int nError = FUNC(__VA_ARGS__); if( nError != CELL_OK ) { Error( "Error 0x%X in " #FUNC ", %s:%d\n", nError, __FILE__, __LINE__ ); } } while( 0 )
  24. DEFINE_LOGGING_CHANNEL_NO_TAGS( LOG_VJOBS, "VJOBS" );
  25. CFragmentProgramConstantPatcher_SPU g_pixelShaderPatcher; // Patches pixel shader constants
  26. CMicroProfiler g_mpBindProgram, g_mpFpcPatch2;
  27. // debug only
  28. CFragmentProgramConstantPatcher_SPU::CFragmentProgramConstantPatcher_SPU()
  29. {
  30. m_pBuffer = m_pBufferEnd = NULL;
  31. m_nIoOffsetDelta = 0; // m_pBuffer + m_nIoOffsetDelta == IO offset usable by RSX
  32. m_pPutFragmentProgram = NULL;
  33. #ifdef DEBUG_FPC_PATCHER
  34. m_bSync = ( CommandLine()->FindParm( "-fpcpsync" ) != 0 );
  35. #endif
  36. }
//--------------------------------------------------------------------------
// Set up the patcher to emit patched fragment programs into a caller-owned
// buffer in RSX local memory.  Allocates the shared patch-state journal that
// the PPU and SPU exchange constant ranges through, and verifies that the
// buffer maps contiguously into RSX IO address space so a single constant
// delta converts PPU addresses to IO offsets.
//
// pBuffer - destination for patched ucode; must be mappable via
//           cellGcmAddressToOffset and 128-byte aligned
// nSize   - size of pBuffer, in bytes
//--------------------------------------------------------------------------
void CFragmentProgramConstantPatcher_SPU::InitLocal( void *pBuffer, uint nSize )
{
	m_nFpcPatchCounter = 0;
	m_nFpcPatchCounterOfLastSyncJob = 0;
	//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL2 );
	const uint nOverfetchGuard = 1024; // RSX front end prefetches up to 4k, but 1k is ( should be ) enough to avoid overfetch crashes
	// NOTE(review): nOverfetchGuard is not referenced anywhere in this path —
	// presumably only a non-local init path uses it; confirm before removing.
	const uint nStateBufferQwords = 1 << 12; // make space for at least 8 full batches of constants...
	uint nPatchStateBufferSize = ( sizeof( job_fpcpatch::FpcPatchState_t ) + sizeof( fltx4 ) * nStateBufferQwords );
	uint32 nBufferIoOffset;
	m_bFpcPatchOnPpu = ( 0 != CommandLine()->FindParm( "-fpcpatchonppu" ) );
#ifdef DEBUG_FPC_PATCHER
	m_bTestAlwaysStateSync = ( 0 != CommandLine()->FindParm( "-fpcpstatesync" ) );
#endif
	m_bEnableSPU = true;
	// mask of 0 makes (counter & mask) == 0 in FpcPatch2, i.e. every patch job synchronous
	m_nFpcPatchSyncMask = 0;
	// use this passed buffer (probably from local memory) for the patched stuff
	m_pBuffer = ( uint32* ) pBuffer;
	m_pBufferEnd = ( uint32* ) ( uintp( pBuffer ) + nSize );
	m_nBufferLocation = CELL_GCM_LOCATION_LOCAL;
	m_isBufferPassedIn = true; // destructor must not free pBuffer, only the state block
	m_state.Init( ( job_fpcpatch::FpcPatchState_t* )MemAlloc_AllocAligned( nPatchStateBufferSize, 128 ), nStateBufferQwords );
	GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffset );
#ifdef DBGFLAG_ASSERT
	// verify that the whole buffer maps linearly into IO space (no page remapping),
	// since m_nIoOffsetDelta below assumes one constant pointer->offset delta
	uint32 nBufferIoOffsetCheck;
	GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffsetCheck );
	Assert( nBufferIoOffsetCheck == nBufferIoOffset );
	Assert( !( nBufferIoOffsetCheck & 0x7F ) ); // must be 128-byte aligned
	for( uint nOffset = 0; nOffset < nSize; nOffset += 128 )
	{
		GCM_MUST_SUCCEED( cellGcmAddressToOffset, ((uint8*)m_pBuffer) + nOffset, &nBufferIoOffsetCheck );
		Assert( nBufferIoOffsetCheck == nBufferIoOffset + nOffset );
	}
#endif
	m_nIoOffsetDelta = nBufferIoOffset - uintp( m_pBuffer ); // add to a PPU address to get its RSX IO offset
#ifdef DEBUG_FPC_PATCHER
	// PPU-side shadow of the virtual constant register file, used to cross-check SPU output;
	// both shadow and SPU-visible registers start filled with the 0xCD debug pattern
	m_pSyncState = ( fltx4* ) MemAlloc_AllocAligned( sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT, 16 );
	V_memset( m_pSyncState, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
	V_memset( m_state.m_pSharedState->m_reg, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
#endif
	ResetPut();
	//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL0 );
}
//--------------------------------------------------------------------------
// Tear-down hook; all cleanup currently happens in the destructor, so this
// is intentionally a no-op.
//--------------------------------------------------------------------------
void CFragmentProgramConstantPatcher_SPU::Shutdown()
{
}
//--------------------------------------------------------------------------
// Park the patched-program put pointer at the end of the buffer;
// BindProgram() acquires real space from the fpcp ring before each patch.
//--------------------------------------------------------------------------
void CFragmentProgramConstantPatcher_SPU::ResetPut()
{
	m_pPutFragmentProgram = m_pBufferEnd; // reserved word for the count of constants to set
}
//--------------------------------------------------------------------------
// Release what this patcher owns.  When the ucode buffer was passed in by
// the caller (InitLocal path) only the shared patch-state block is freed;
// otherwise the patcher allocated m_pBuffer itself via sys_memory and frees
// that instead.
// NOTE(review): the !m_isBufferPassedIn branch does not free
// m_state.m_pSharedState — confirm the non-local init path releases it
// elsewhere, otherwise it leaks.
//--------------------------------------------------------------------------
CFragmentProgramConstantPatcher_SPU::~CFragmentProgramConstantPatcher_SPU()
{
	if( m_isBufferPassedIn )
	{
		MemAlloc_FreeAligned( m_state.m_pSharedState );
	}
	else
	{
		sys_memory_free( ( sys_addr_t )m_pBuffer );
	}
#ifdef DEBUG_FPC_PATCHER
	MemAlloc_FreeAligned( m_pSyncState ); // debug shadow register file
#endif
}
//--------------------------------------------------------------------------
// Frame-begin hook: snapshot the patch counter so EndScene() can tell how
// many patch jobs this scene issued.
//--------------------------------------------------------------------------
void CFragmentProgramConstantPatcher_SPU::BeginScene()
{
	m_nFpcPatchCounterAtBeginScene = m_nFpcPatchCounter;
	// we shouldn't have in-flight SPU jobs by now.. should we?
	// (predicted start-of-ranges must lie within one ring length of the real one)
	Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - m_state.m_pSharedState->m_nStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
}
  106. void CFragmentProgramConstantPatcher_SPU::EndScene()
  107. {
  108. #if ENABLE_MICRO_PROFILER > 0
  109. uint nPatchCounter = m_nFpcPatchCounter - m_nFpcPatchCounterAtBeginScene;
  110. extern bool g_bDxMicroProfile;
  111. if( g_bDxMicroProfile && nPatchCounter )
  112. {
  113. g_mpBindProgram.PrintAndReset( "[BindProgram] " );
  114. g_mpFpcPatch2 .PrintAndReset( "[FpcPatch2] " );
  115. }
  116. #endif
  117. }
  118. job_fpcpatch2::FpHeader_t g_nullFpHeader = {0,0,0,0};
  119. // semantics should match cgGLSetFragmentRegisterBlock()
  120. void CFragmentProgramConstantPatcher_SPU::SetFragmentRegisterBlock( uint nStartRegister, uint nVector4fCount, const float * pConstantData )
  121. {
  122. #ifndef _CERT
  123. if ( nStartRegister >= job_fpcpatch::MAX_VIRTUAL_CONST_COUNT || nStartRegister + nVector4fCount > job_fpcpatch::MAX_VIRTUAL_CONST_COUNT )
  124. Error( "Invalid Fragment Register Block Range %u..%u\n", nStartRegister, nStartRegister + nVector4fCount );
  125. #endif
  126. #ifdef DEBUG_FPC_PATCHER
  127. if( m_bSync )
  128. {
  129. fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
  130. m_state.GetSyncState( reg );
  131. Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
  132. }
  133. uint nEnd = m_state.m_nEndOfJournalIdx;
  134. #endif
  135. // we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
  136. // this leaves the max constant buffer size 4 * 16kb + 16 bytes
  137. const uint nMaxUploadRangeBeforeStateSync = ( 32 * 1024 ) / sizeof( fltx4 );
  138. uint numUploadRangeQwords = m_state.m_nEndOfJournalIdx - g_spuGcmShared.m_nFpcpStartRangesAfterLastSync;
  139. ///////////////////////////////////////////////////////////////////////////
  140. //
  141. // PREPATCH MUST BE DONE IN (CTXFLUSH OR) DRAW JOB FROM NOW ON!!! g_spuGcmShared.m_nFpcpStartRangesAfterLastSync IS SYNCHRONOUS AND CORRECT THERE
  142. //
  143. //////////////////////////////////////////////////////////////////////////
  144. /*
  145. bool bPrePatch = nVector4fCount + 1 + numUploadRangeQwords > nMaxUploadRangeBeforeStateSync;
  146. if( bPrePatch )
  147. {
  148. // force state sync now
  149. if( g_spuGcmShared.m_enableStallWarnings )
  150. {
  151. Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Forced to set state on PPU, %u vectors, %u qwords in history. This is slow fallback path.\n", nVector4fCount, numUploadRangeQwords );
  152. }
  153. FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
  154. }
  155. */
  156. if( uint nAttempts = m_state.AddRange( nStartRegister, nVector4fCount, pConstantData ) )
  157. {
  158. if( g_spuGcmShared.m_enableStallWarnings )
  159. {
  160. Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Stall, %d spins. Waiting for more memory; %d qwords, %d jobs buffered up\n", nAttempts, m_state.m_nEndOfJournalIdx - m_state.m_pSharedState->m_nStartRanges, g_spuGcmShared.m_nFpcPatchCounter - m_state.m_pSharedState->m_nThisStatePatchCounter );
  161. }
  162. }
  163. #ifdef DEBUG_FPC_PATCHER
  164. if( m_bTestAlwaysStateSync && !bPrePatch )
  165. {
  166. FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
  167. }
  168. V_memcpy( m_pSyncState + nStartRegister, pConstantData, nVector4fCount * sizeof( fltx4 ) );
  169. if( m_bSync )
  170. {
  171. fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
  172. m_state.GetSyncState( reg );
  173. Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
  174. }
  175. #endif
  176. }
  177. //volatile int g_nDebugStage = 0;
  178. //
  179. // Match the semantics of cgGLBindProgram()
  180. // There are 2 formats of fragment shaders, see SDK docs "2. 2 Cg Compiler Options" and
  181. // in Cg Compiler User's Guide:
  182. // "7. 2 NV Binary Shader Format (VPO and FPO)"
  183. // "7. 4 Cgb File Format Specification"
  184. //
  185. void CFragmentProgramConstantPatcher_SPU::BindProgram( const struct IDirect3DPixelShader9 * psh )
  186. {
  187. MICRO_PROFILE( g_mpBindProgram );
  188. const job_fpcpatch2::FpHeader_t * prog = psh->m_data.m_eaFp;
  189. uint32 nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) + m_nIoOffsetDelta;
  190. g_spuGcmShared.m_fpcpRing.UnlockRsxMemoryForSpu();
  191. m_pPutFragmentProgram = ( uint32* )g_spuGcmShared.m_fpcpRing.LockRsxMemoryForSpu( &g_spuGcmShared.m_fpcpJobChain, prog->m_nUcodeSize );
  192. nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) - uintp( g_ps3gcmGlobalState.m_pLocalBaseAddress );
  193. if( !IsCert() && nFragmentProgramOffset >= g_ps3gcmGlobalState.m_nLocalSize )
  194. {
  195. Error( "Fragment program Ucode buffer offset 0x%X is at unexpected address not in local memory\n", nFragmentProgramOffset );
  196. }
  197. if ( !IsCert() && ( m_pPutFragmentProgram < m_pBuffer || m_pPutFragmentProgram >= m_pBufferEnd ) )
  198. {
  199. Error( "Fragment Program UCode buffer overflow.\n" );
  200. }
  201. #ifdef DEBUG_FPC_PATCHER
  202. if( m_bSync )
  203. {
  204. fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
  205. m_state.GetSyncState( reg );
  206. Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
  207. }
  208. #endif
  209. uint nTexControls = prog->m_nTexControls;
  210. // set jump to self
  211. GCM_CTX_RESERVE( 7 + 2 * nTexControls );
  212. uint32 * pJts = NULL;
  213. FpcPatch2( prog, psh->m_data.m_nFpDmaSize, m_pPutFragmentProgram, pJts );
  214. CELL_GCM_METHOD_SET_SHADER_CONTROL( GCM_CTX->current, prog->m_nShaderControl0 ); // +2
  215. CELL_GCM_METHOD_SET_SHADER_PROGRAM( GCM_CTX->current, m_nBufferLocation + 1, ( nFragmentProgramOffset & 0x1fffffff ) ); // +2
  216. CELL_GCM_METHOD_SET_VERTEX_ATTRIB_OUTPUT_MASK( GCM_CTX->current, psh->m_data.m_attributeInputMask | 0x20 ); // +2
  217. V_memcpy( GCM_CTX->current, prog->GetTexControls(), nTexControls * sizeof( uint32 ) * 2 );
  218. GCM_CTX->current += 2 * nTexControls;
  219. #ifdef DEBUG_FPC_PATCHER
  220. if( m_bSync )
  221. {
  222. g_ps3gcmGlobalState.CmdBufferFlush( CPs3gcmGlobalState::kFlushForcefully );
  223. while ( *( volatile uint32* )pJts )
  224. {
  225. sys_timer_usleep( 50 );// wait for nop
  226. }
  227. #ifdef DEBUG_FPC_PATCHER
  228. {
  229. fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
  230. m_state.GetSyncState( reg );
  231. Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
  232. }
  233. ValidatePatchedProgram( psh->m_pCgProg, m_pPutFragmentProgram );
  234. uint32 nFragmentProgramOffsetCheck;
  235. GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pPutFragmentProgram, &nFragmentProgramOffsetCheck );
  236. Assert( nFragmentProgramOffsetCheck == nFragmentProgramOffset );
  237. #endif
  238. g_ps3gcmGlobalState.CmdBufferFinish();
  239. }
  240. #endif
  241. m_nFpcPatchCounter++;
  242. }
  243. uint g_nFpcPatch2JobExtraFlags = 0; // set this to 2 and SPU will break
  244. static int s_nFpcPatch2Calls = 0;
//--------------------------------------------------------------------------
// Build and enqueue one SPURS job (job_fpcpatch2) that patches a fragment
// program's embedded constants on SPU and DMAs the patched ucode to
// pPatchedProgram.  DMA list layout: [0] program header+tables, [1] the
// shared patch state, [2..6] the pending constant-range journal (one or two
// spans depending on ring wrap); dma-list qwords 7..9 are reused as job
// parameters.
//
// prog            - fragment program header (g_nullFpHeader forces a pure
//                   state-sync job)
// nFpDmaSize      - bytes of prog to DMA to the SPU
// pPatchedProgram - destination for patched ucode; NULL forces the job to
//                   be synchronous
// pJts            - optional RSX jump-to-self word the SPU clears when done;
//                   may be NULL
//--------------------------------------------------------------------------
void CFragmentProgramConstantPatcher_SPU::FpcPatch2( const job_fpcpatch2::FpHeader_t * prog, uint nFpDmaSize, void *pPatchedProgram, uint32 * pJts )
{
	MICRO_PROFILE( g_mpFpcPatch2 );
#ifdef VJOBS_ON_SPURS
	VjobChain3 &jobChain = g_spuGcm.m_jobSink;
	uint32 nUCodeSize = prog->m_nUcodeSize; // NOTE(review): unused below; looks like a leftover — confirm before removing
	CellSpursJob128 * pJob = g_spuGcm.m_jobPool128.Alloc( *g_spuGcm.m_pRoot->m_pFpcPatch2 );
	Assert( pJob->header.sizeDmaList == 0 && pJob->header.sizeInOrInOut == 0 ); // the default MUST always be 1
	pJob->header.useInOutBuffer = 1;
	CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );
	// element 0: the program (header, patch tables, ucode); element 1: shared state
	dmaConstructor.AddInputDma( nFpDmaSize, prog );
	dmaConstructor.AddInputDma( sizeof( *m_state.m_pSharedState ), ( void* )m_state.m_pSharedState );
	// the g_spuGcmShared.m_nFpcpStartRangesAfterLastSync runs ahead of m_state.m_pSharedState->m_nStartRanges , because it's a PREDICTED
	// start of range. It'll be absolutely in-sync with m_state.m_pSharedState->m_nStartRanges if we run SPUs synchronously
#ifdef DBGFLAG_ASSERT
	uint nSharedStateStartRanges = m_state.m_pSharedState->m_nStartRanges;
#endif
	// NOTE: if the asserts below fire, it may be due to invalid value in nSharedStateStartRanges because SPU DMAs stuff right down to m_state.m_pSharedState and it's changing while this code executes
	Assert( uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
	Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - nSharedStateStartRanges ) <= uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) );
	uint nStartOfJournal = /*nSharedStateStartRanges*/g_spuGcmShared.m_nFpcpStartRangesAfterLastSync, nBufferMask = m_state.m_pSharedState->m_nBufferMask;
	// we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
	// this leaves the max constant buffer size 4 * 16kb + 16 bytes
	const uint numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
	Assert( numRangeQwords <= nBufferMask + 1 );
	if ( numRangeQwords != 0 )
	{
		// nEndOfSpan0 = first ring-buffer boundary at or after nStartOfJournal
		uint nEndOfSpan0 = ( nStartOfJournal + nBufferMask + 1 ) & ~nBufferMask;
		if ( ( signed int )( nEndOfSpan0 - m_state.m_nEndOfJournalIdx ) >= 0 )
		{
			// journal does not wrap: one contiguous DMA covers the whole span
			//numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
			dmaConstructor.AddInputDmaLarge( ( numRangeQwords ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
		}
		else
		{
			// journal wraps: DMA the tail of the ring, then its head
			//numRangeQwords = nFirstRange + nSecondRange ;
			dmaConstructor.AddInputDmaLarge( ( nEndOfSpan0 - nStartOfJournal ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
			dmaConstructor.AddInputDmaLarge( ( m_state.m_nEndOfJournalIdx - nEndOfSpan0 ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() );
		}
	}
	else
	{
		dmaConstructor.AddSizeInOrInOut( 16 ); // we need at least 16 bytes in the ranges area for temporary storage
	}
	dmaConstructor.FinishIoBuffer( &pJob->header );
	// qwords 7..9 below are job parameters, so the real DMA list must fit in 0..6
	if( pJob->header.sizeDmaList > 7 * sizeof( uint64 ) )
	{
		Error( "FpcPatch2: DMA list size out of range (%d). job_fpcpatch2 parameters won't fit. numRangeQwords = %d\n", pJob->header.sizeDmaList, numRangeQwords );
	}
	// IMPORTANT: make it always synchronous , in case we don't have the target to patch. The only reason for this job to exist is to make it synchronous
	// Also, if the range is large, still make it synchronous, to avoid subsequent jobs doing a lot of computations in vein
	uint nAsync = !pPatchedProgram || numRangeQwords >= 1024 ? 0 : ( ( m_nFpcPatchCounter ) & m_nFpcPatchSyncMask ) ;
	dmaConstructor[7][0] = m_nFpcPatchCounterOfLastSyncJob;
	dmaConstructor[7][1] = m_nFpcPatchCounter;
	dmaConstructor[8][0] = ( uint32 ) pPatchedProgram;
	dmaConstructor[8][1] = uintp( pJts ); // the SPU->RSX dma element; may be NULL
	dmaConstructor[9][0] = m_state.m_nEndOfJournalIdx;
	dmaConstructor[9][1] = ( uint32 ) nStartOfJournal;
	if( !IsCert() )
	{
		pJob->header.jobType |= CELL_SPURS_JOB_TYPE_MEMORY_CHECK;
	}
	dmaConstructor[8][0] |= g_nFpcPatch2JobExtraFlags; // debug hook: extra flags OR'ed into the target pointer field
	if ( !nAsync )
	{
		// synchronous job: SPU writes the full register image back and stalls
		// successor jobs; record this as the last sync point
		dmaConstructor[8][0] |= job_fpcpatch::FLAG_PUT_STATE;
		m_nFpcPatchCounterOfLastSyncJob = m_nFpcPatchCounter;
		pJob->header.jobType |= CELL_SPURS_JOB_TYPE_STALL_SUCCESSOR;
		g_spuGcmShared.m_nFpcpStartRangesAfterLastSync = m_state.m_nEndOfJournalIdx;
	}
#ifdef DBGFLAG_ASSERT
	// validate the job descriptor; s_nJobErrors is a breakpoint-friendly counter
	int nError = cellSpursCheckJob( ( const CellSpursJob256* )pJob, sizeof( *pJob ), 256 );
	static int s_nJobErrors = 0;
	if( CELL_OK != nError )
	{
		++s_nJobErrors;
	}
#endif
	if ( !nAsync )
	{
		jobChain.PushSyncJobSync( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
	}
	else
	{
		jobChain.Push( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
	}
#ifdef DEBUG_FPC_PATCHER
	if( m_bSync )
	{
		// -fpcpsync: busy-wait until the SPU clears the JTS word (if any) and
		// releases the job binary (eaBinary is zeroed on completion)
		if( pJts )
		{
			volatile uint32 * pJts2 = pJts;
			while( *pJts2 )
				continue;
		}
		volatile uint64_t * pEaJob = &pJob->header.eaBinary;
		while( * pEaJob )
			continue;
	}
#endif
	s_nFpcPatch2Calls++;
#endif
}
  348. #ifdef DEBUG_FPC_PATCHER
  349. extern void PatchUcodeConstSwap( uint32 * pDestination, const uint32 * pSource, int nLength );
  350. extern uint fspatchGetLength( CGtype nType );
  351. uint32 g_nConstLengthCounter[5] = { 0, 0, 0, 0, 0 };
//--------------------------------------------------------------------------
// Debug-only cross-check: re-patch the program's embedded constants on the
// PPU from the m_pSyncState shadow registers and compare the result, word
// for word, against the ucode the SPU patch job produced.
//
// prog          - original Cg binary (must be PROFILE_SCE_FP_RSX)
// pPatchedUcode - ucode emitted by the SPU patch job
//--------------------------------------------------------------------------
void CFragmentProgramConstantPatcher_SPU::ValidatePatchedProgram( const CgBinaryProgram * prog, void * pPatchedUcode )
{
	Assert( prog->profile == PROFILE_SCE_FP_RSX && prog->binaryFormatRevision == CG_BINARY_FORMAT_REVISION );
	uint32 nUCodeSize = prog->ucodeSize;
	void * pUcode = stackalloc( nUCodeSize ); // PPU-side scratch copy to patch
	void * pSourceUcode = ( ( uint8* ) prog ) + prog->ucode;
	V_memcpy( pUcode, ( ( uint8* ) prog ) + prog->ucode, nUCodeSize );
	CgBinaryParameter * pParameters = ( CgBinaryParameter * )( uintp( prog ) + prog->parameterArray ) ;
	uint32 * pPatchDestination = NULL;
	// our direct table walk must agree with the libgcm Cg accessors throughout
	Assert( cellGcmCgGetCountParameter( ( CGprogram ) prog ) == prog->parameterCount );
	for ( int nPar = 0; nPar < prog->parameterCount; ++nPar )
	{
		CgBinaryParameter * pPar = pParameters + nPar;
		Assert( pPar == ( CgBinaryParameter * ) cellGcmCgGetIndexParameter( ( CGprogram ) prog, nPar ) );
#ifdef DBGFLAG_ASSERT
		const char * pLeafName = ( const char * )( uintp( prog ) + pPar->name ); // for inspection in the debugger
		( void )pLeafName;
		uint32 * pDefault = pPar->defaultValue ? ( uint32* )( uintp( prog ) + pPar->defaultValue ) : NULL ;
#endif
		if ( pPar->embeddedConst )
		{
			Assert( pPar->res == CG_C && pPar->var == CG_UNIFORM ); // this MUST be a uniform constant.. at least I think that's the only kind we need to patch
			const CgBinaryEmbeddedConstant * pEmbedded = ( const CgBinaryEmbeddedConstant* )( uintp( prog ) + pPar->embeddedConst );
			int nLength = fspatchGetLength( pPar->type );
			g_nConstLengthCounter[nLength] ++; // debug histogram of constant sizes
			// a parameter may be embedded at several locations in the ucode
			for ( uint nEm = 0; nEm < pEmbedded->ucodeCount; ++ nEm )
			{
				uint ucodeOffset = pEmbedded->ucodeOffset[nEm]; // is this the offset from prog structure start?
				Assert( ucodeOffset < nUCodeSize - 4 );
#ifdef DBGFLAG_ASSERT
				Assert( cellGcmCgGetEmbeddedConstantOffset( ( CGprogram ) prog, ( CGparameter ) pPar, nEm ) == ucodeOffset );
				const float * pDefaultCheck = cellGcmCgGetParameterValues( ( CGprogram ) prog, ( CGparameter ) pPar );
				Assert( pDefault == ( uint32* ) pDefaultCheck );
				// unpatched ucode should still hold the parameter's default value
				uint32 * pUcodeEmConst = ( uint32* )( uintp( pSourceUcode ) + ucodeOffset );
				Assert( !pDefault || !V_memcmp( pDefault, pUcodeEmConst, nLength * 4 ) );
#endif
				pPatchDestination = ( uint32* )( uintp( pUcode ) + ucodeOffset );
				uint32 * pPatchedCheck = ( uint32* )( uintp( pPatchedUcode ) + ucodeOffset );
				// patch from the PPU shadow register, then compare with the SPU output
				PatchUcodeConstSwap( pPatchDestination, ( uint32* ) & ( m_pSyncState[pPar->resIndex] ), nLength );
				Assert( !V_memcmp( pPatchDestination, pPatchedCheck, nLength * 4 ) );
			}
		}
	}
	// final full-buffer comparison: PPU patch result == SPU patch result
	Assert( !V_memcmp( pPatchedUcode, pUcode, nUCodeSize ) );
}
  397. #endif
  398. void FpcPatchState::Init( job_fpcpatch::FpcPatchState_t * pSharedState, uint32 nBufferQwords )
  399. {
  400. #ifdef _DEBUG
  401. //m_nRangesAdded = 0;
  402. #endif
  403. pSharedState->m_nBufferMask = m_nBufferMask = nBufferQwords - 1;
  404. pSharedState->m_nStartRanges = m_nEndOfJournalIdx = IsCert() ? 0 : nBufferQwords - 128;
  405. pSharedState->m_eaThis = m_pSharedState = pSharedState;
  406. pSharedState->m_nThisStatePatchCounter = 0;
  407. pSharedState->m_nDebuggerBreak = 0;
  408. }
//--------------------------------------------------------------------------
// Debug helper: reconstruct on the PPU the register file the SPU would
// produce — start from the last fully-applied register image and replay
// every constant range still pending in the journal ring.
//
// pRegisters - out: MAX_VIRTUAL_CONST_COUNT fltx4 register values
//--------------------------------------------------------------------------
void FpcPatchState::GetSyncState( fltx4 * pRegisters )
{
	// baseline: the register image as of the last synchronous patch job
	V_memcpy( pRegisters, m_pSharedState->m_reg, job_fpcpatch:: MAX_VIRTUAL_CONST_COUNT * sizeof( fltx4 ) );
	for( uint nJournalIdx = m_pSharedState->m_nStartRanges; nJournalIdx < m_nEndOfJournalIdx ; )
	{
		// each journal entry is one ConstRangeHeader_t qword followed by
		// m_nCount value qwords; indices wrap via the ring mask
		job_fpcpatch:: ConstRangeHeader_t & range = ((job_fpcpatch::ConstRangeHeader_t*)m_pSharedState->GetBufferStart())[ nJournalIdx & m_pSharedState->m_nBufferMask ];
		nJournalIdx++;
		for( uint nConstIdx = 0 ; nConstIdx < range.m_u32.m_nCount; ++nConstIdx, ++nJournalIdx )
		{
			pRegisters[ range.m_u32.m_nStart + nConstIdx ] = m_pSharedState->GetBufferStart()[nJournalIdx & m_pSharedState->m_nBufferMask ];
		}
	}
}
  422. /*
  423. void FpcPatchState::Reset()
  424. {
  425. m_nEndOfJournalIdx = 0;
  426. m_pSharedState->m_nStartRanges = 0;
  427. }
  428. */
  429. #ifdef _DEBUG
  430. static int s_nDebugRangeAdd = -1, s_nDebugSetConst = -1;
  431. #endif
//--------------------------------------------------------------------------
// Append one constant range (a ConstRangeHeader_t qword followed by nCount
// value qwords) to the shared journal ring.  Spin-waits while the SPU
// consumer hasn't freed enough ring space; returns the number of spin
// iterations (0 == no stall) so the caller can print a stall warning.
//
// nStart - first virtual constant register in the range
// nCount - number of float4 registers
// pData  - nCount * 4 floats to copy into the journal
//--------------------------------------------------------------------------
uint FpcPatchState::AddRange( uint32 nStart, uint32 nCount, const float * pData )
{
#ifndef _CERT
	if( nStart + nCount > job_fpcpatch::MAX_VIRTUAL_CONST_COUNT )
	{
		Error( "AddRange(%d..%d) out of range <%d\n", nStart, nCount, int( job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
	}
#endif
#ifdef _DEBUG
	//Assert( s_nDebugRangeAdd != m_nRangesAdded );
	// debug hook: break (with the written value in a register) whenever the
	// watched constant s_nDebugSetConst falls inside this range
	if( int( s_nDebugSetConst - nStart ) >= 0 && int( s_nDebugSetConst - nStart ) < int( nCount ) )
	{
		fltx4 flDebugRegister = LoadUnalignedSIMD( pData + 4 * int( s_nDebugSetConst - nStart ) );
		DebuggerBreak();
	}
	//++m_nRangesAdded;
#endif
	// spin-wait, then V_memcpy range
	COMPILE_TIME_ASSERT( sizeof( job_fpcpatch::ConstRangeHeader_t ) == 16 );
	const uint nSpins = 0x1FF;
	Assert( !( nSpins & ( nSpins + 1 ) ) ); // must be of the form 2^n - 1
	//
	// We need space for nCount + 1 QWords (1 Qword for the ConstRangeHeader_t)
	// And we need m_nEndOfJournalIdx != m_nStartRanges to distinguish between
	// the all-empty and all-full buffers
	//
	uint nAttempts = 0;
	for ( ; ; ++nAttempts )
	{
		// m_nStartRanges is advanced by the SPU as it consumes ranges; re-read each spin
		uint32 nStartRanges = m_pSharedState->m_nStartRanges;
		Assert( uint32( m_nEndOfJournalIdx - nStartRanges ) <= m_nBufferMask + 1 );
		// compute the new end - start; is it running further than buffer size away?
		if ( ( m_nEndOfJournalIdx + nCount - ( nStartRanges + m_nBufferMask + 1 ) ) & 0x80000000 )
		{ // no, the comparison is negative, therefore it's safe to fill it in
			break;
		}
		// if ( ( nAttempts & nSpins ) == nSpins )
		{
			// the caller prints warning about this stall.
			sys_timer_usleep( 60 ); // TODO: proper spinwait; proper OS syncronization
			if( nAttempts == ( 1000000 / 60 ) )
			{
				// waiting for a second already ... dump the full patcher / ring /
				// RSX control-register state for a hang report
				Warning(
					"***************************************************************************************************************\n"
					"* SPU hang in FpcPatchState::AddRange(). Please send this log (including a couple of screens above) to Sergiy *\n"
					);
				Msg( "AddRange(%d,%d,%p), ", nStart, nCount, pData );
				Msg( "SharedState @%p {start=0x%X&0x%X,patch=%X,job=%X},", m_pSharedState, m_pSharedState->m_nStartRanges, m_pSharedState->m_nBufferMask, m_pSharedState->m_nThisStatePatchCounter, m_pSharedState->m_eaThisStateJobDescriptor );
				Msg( "FpcpState @%p {end=0x%X},", this, this->m_nEndOfJournalIdx );
				Msg( "SpuGcmShared trace {0x%X,0x%X,0x%X}\n", g_spuGcmShared.m_nFpcPatchCounterOfLastSyncJob, g_spuGcmShared.m_nFpcPatchCounter, g_spuGcmShared.m_nFpcpStartRangesAfterLastSync );
				Msg( "RSX put=%X, get=%X sysring{put=%X,end=%X}\n", g_spuGcmShared.m_eaGcmControlRegister->put, g_spuGcmShared.m_eaGcmControlRegister->get,
					g_spuGcmShared.m_sysring.m_nPut, g_spuGcmShared.m_sysring.m_nEnd );
				Msg( "last JTS ret guard patched @%X, ", *cellGcmGetLabelAddress( GCM_LABEL_DEBUG_FPCP_RING ) );
				// dump both sides of the RSX/SPU double ring: each segment shows its
				// base and whether its guard word is LWSYNC (released) or JTS (held)
				Msg( "ringRsx[%d]:", g_spuGcmShared.m_fpcpRing.m_ringRsx.Count() );
				for( int i = 0; i < g_spuGcmShared.m_fpcpRing.m_ringRsx.Count(); ++i )
				{
					RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringRsx[i];
					Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_LWSYNC ? "LWSYNC" : *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_JTS ? "JTS" : "ERROR" );
				}
				Msg( "\nringSpu[%d]:", g_spuGcmShared.m_fpcpRing.m_ringSpu.Count() );
				for( int i = 0; i < g_spuGcmShared.m_fpcpRing.m_ringSpu.Count(); ++i )
				{
					RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringSpu[i];
					Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_LWSYNC ? "LWSYNC" : *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_JTS ? "JTS" : "ERROR" );
				}
				Msg( "***************************************************************************************************************\n" );
			}
		}
	}
	// we have enough free buffer to insert stuff
	job_fpcpatch::ConstRangeHeader_t *hdr = (job_fpcpatch::ConstRangeHeader_t *)AddInternalPtr();
	hdr->m_u32.m_nStart = nStart;
	hdr->m_u32.m_nCount = nCount;
	// add constants block
	AddInternalBlock( pData, nCount );
	return nAttempts;
}
  510. #endif