//========= Copyright © 1996-2006, Valve Corporation, All rights reserved. ============// // // Purpose: Proxy for D3DX routines // // $NoKeywords: $ // //=============================================================================// // #define WIN32_LEAN_AND_MEAN #include #include #include #include #include #include "../../dx10sdk/include/d3dx10.h" typedef D3D10_SHADER_MACRO D3DXMACRO; typedef LPD3D10INCLUDE LPD3DXINCLUDE; typedef ID3D10Include ID3DXInclude; typedef D3D10_INCLUDE_TYPE D3DXINCLUDE_TYPE; typedef ID3D10Blob* LPD3DXBUFFER; typedef void* LPD3DXCONSTANTTABLE; #include "filememcache.h" #include "dxincludeimpl.h" #include "cgc.h" #include "SCEShaderPerf.h" typedef unsigned int uint; typedef unsigned __int64 uint64; #include "../../public/ps3shaderoptimizer/ps3optimalschedulesfmt.h" const int g_nRandSched[] = { // List of 17 good scheduler settings, found empirically by Sony. 8, 10, 15, 4, 32, 2, 1, 64, 13, 14, 16, 17, 18, 19, 128, 256, 512, // Extra 6 scheduler settings 6, 100, 192, 3, 384, 24 }; #define NUM_RANDOM_SCHEDULE_VALUES ARRAYSIZE( g_nRandSched ) #define NUM_RANDOM_SCHEDULE_SEEDS 12 // Faster settings, for testing purposes (currently takes around 11 minutes): //#define NUM_RANDOM_SCHEDULE_VALUES 8 //#define NUM_RANDOM_SCHEDULE_SEEDS 1 //#define NUM_RANDOM_SCHEDULE_VALUES 1 //#define NUM_RANDOM_SCHEDULE_SEEDS 1 #define CGC_COMPILER_OPTIMIZATION_LEVEL 1 // Aux function prototype const char * WINAPI GetDllVersion( void ); void* CgMalloc( void* arg, size_t size ); // Memory allocation callback void CgFree( void* arg, void* ptr ); // Memory freeing callback HANDLE g_mutexDebug = NULL; void DebugLog( const char * pMsg, ...) { (void)pMsg; #ifdef _DEBUG FILE * f = fopen( "c:\\dx_proxy_ps3.log", "at" ); if( f ) { if( g_mutexDebug ) WaitForSingleObject( g_mutexDebug, INFINITE ); va_list args; va_start(args,pMsg); SYSTEMTIME lt; GetLocalTime( < ); fprintf( f, "%02d:%02d:%02d.%04d[%d.%d]", lt.wHour, lt.wMinute, lt.wSecond, lt.wMilliseconds, GetCurrentProcessId(), GetCurrentThreadId() ); vfprintf(f, pMsg, args ); fputs( "\n", f ); va_end( args ); fclose( f ); if( g_mutexDebug ) ReleaseMutex( g_mutexDebug ); } #endif } // // ExtractDependencies // // Retrieves all the additional required binaries from the resources and // places them to a temporary location. Then the binaries are mapped into // the address space of the calling process. // static BOOL ExtractDependencies( void ) { return TRUE; } class CgContextWrapper { public: CGCcontext *m_cgc; CgContextWrapper() { CGCmem mem; mem.malloc = CgMalloc; mem.free = CgFree; m_cgc = sceCgcNewContext( &mem ); } ~CgContextWrapper() { sceCgcDeleteContext( m_cgc ); } operator CGCcontext * () { return m_cgc ; } }; // DLL entry point: DllMain BOOL WINAPI DllMain( HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved ) { /*UNUSED_ALWAYS*/( hinstDLL ); /*UNUSED_ALWAYS*/( lpvReserved ); switch ( fdwReason ) { case DLL_PROCESS_ATTACH: { g_mutexDebug = CreateMutex( NULL, FALSE, "DxProxyPs3DebugLog" ); } // Process is attaching - make sure it can find the dependencies return ExtractDependencies(); case DLL_PROCESS_DETACH: if( g_mutexDebug ) CloseHandle( g_mutexDebug ); break; } return TRUE; } // Obtain DLL version #pragma comment(linker, "/EXPORT:GetDllVersionLong=?GetDllVersionLong@@YGPBDXZ") const char * WINAPI GetDllVersionLong( void ) { #if defined( _DEBUG ) return "{DX_PROXY for PS3_V00_PC DEBUG}"; #else return "{DX_PROXY for PS3_V00_PC RELEASE}"; #endif } #pragma comment(linker, "/EXPORT:GetDllVersion=?GetDllVersion@@YGPBDXZ") const char * WINAPI GetDllVersion( void ) { #ifdef _DEBUG return "DXPRX_PS3_V00_d"; #else return "DXPRX_PS3_V00_r"; #endif } LPD3DXINCLUDE g_pInclude = NULL; uint g_nCgAllocated = 0; int CgcIncludeOpen( SCECGC_INCLUDE_TYPE type, const char* filename, char** data, size_t* size ) { D3DXINCLUDE_TYPE typeD3d = D3D10_INCLUDE_LOCAL; if( type == SCECGC_SYSTEM_INCLUDE ) typeD3d = D3D10_INCLUDE_SYSTEM; HRESULT hr = g_pInclude->Open( typeD3d, filename, NULL, (LPCVOID*)data, size ); return ( S_OK == hr ); } void* CgMalloc( void* arg, size_t size ) // Memory allocation callback { g_nCgAllocated += size; uint * pData = (uint*)malloc( size + sizeof( uint ) ); *pData = size; //DebugLog("alloc %d->%p", size, pData+1); return pData + 1; } void CgFree( void* arg, void* ptr ) // Memory freeing callback { uint * pData = ( ( uint* ) ptr ) - 1; //if( *pData > 0x1000000 && IsDebuggerPresent() ) // _asm{int 3 ;}; //DebugLog("free %p->%u", ptr, *pData); g_nCgAllocated -= *pData; free( pData ); } // // return values: // 1 - Include file successfully closed. // // 0 - Failure closing an include file. // int CgcIncludeClose( const char* data ) { HRESULT hr = g_pInclude->Close( data ); return ( S_OK == hr ); } class BlobAdaptor: public ID3D10Blob { public: uint m_nRefCount; CGCbin *m_bin; char * m_pMemory; uint m_nSize; BlobAdaptor( ID3D10Blob * pLeft, ID3D10Blob * pRight ) { m_bin = NULL; m_nRefCount = 1; m_nSize = pLeft->GetBufferSize( ) + pRight->GetBufferSize() ; m_pMemory = new char [m_nSize + 1]; memcpy(m_pMemory, pLeft->GetBufferPointer(), pLeft->GetBufferSize( )); memcpy(m_pMemory + pLeft->GetBufferSize(), pRight->GetBufferPointer(), pRight->GetBufferSize( ) ); m_pMemory[m_nSize] = '\0'; } BlobAdaptor() { m_pMemory = NULL; m_nSize = 0; CGCmem mem; mem.malloc = CgMalloc; mem.free = CgFree; m_bin = sceCgcNewBin( &mem ); m_nRefCount = 1; } ~BlobAdaptor() { if( m_bin ) sceCgcDeleteBin( m_bin ); if( m_pMemory ) delete[]m_pMemory; } void Bake() { if( m_bin ) { m_nSize = sceCgcGetBinSize( m_bin ); m_pMemory = new char [m_nSize + 1]; memcpy( m_pMemory, sceCgcGetBinData( m_bin ), m_nSize ); m_pMemory[m_nSize] = '\0'; sceCgcDeleteBin( m_bin ); m_bin = NULL; } } STDMETHOD(QueryInterface)(THIS_ REFIID iid, __deref_out LPVOID *ppv) { if( iid == IID_IUnknown || iid == IID_ID3D10Blob ) { AddRef(); *ppv = this; return S_OK; } *ppv = NULL; return E_NOINTERFACE; } STDMETHOD_( ULONG, AddRef )(THIS) { return ++m_nRefCount; } STDMETHOD_( ULONG, Release )(THIS) { if( --m_nRefCount ) return m_nRefCount; delete this; return 0; } // ID3DXBuffer STDMETHOD_(__out LPVOID, GetBufferPointer)(THIS) { if( m_bin ) return sceCgcGetBinData( m_bin ); else return m_pMemory; } STDMETHOD_(DWORD, GetBufferSize)(THIS) { if( m_bin ) return sceCgcGetBinSize( m_bin ); else return m_nSize; } }; static inline bool operator< ( const SceSpMeasurementResult& target, const SceSpMeasurementResult& reference ) { if ( target.nResult != SCESP_OK ) return false; if ( target.nCycles < reference.nCycles ) return true; else if ( target.nCycles == reference.nCycles ) { if ( target.nRRegisters < reference.nRRegisters ) return true; else return false; } else return false; } // Use the Win32 crypto API to create a 64-bit GUID. (This sucks, but it avoids creating dependencies against tier0/tier1 into a DLL that is not expected to have such dependencies.) static uint64 CreateGUID64() { uint64 nResult = 0; HCRYPTPROV hCryptProv; if ( CryptAcquireContext(&hCryptProv, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT | CRYPT_MACHINE_KEYSET ) ) { CryptGenRandom( hCryptProv, sizeof( nResult ), (BYTE*)&nResult ); CryptReleaseContext( hCryptProv, 0 ); } return nResult; } static bool HashBuffer( const void *pBuf, uint nLen, uint64 &nHashLow, uint64 &nHashHigh ) { bool bResult = false; nHashLow = 0; nHashHigh = 0; HCRYPTPROV hCryptProv; if ( CryptAcquireContext(&hCryptProv, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT | CRYPT_MACHINE_KEYSET ) ) { HCRYPTHASH hHash; if ( CryptCreateHash( hCryptProv, CALG_MD5, 0, 0, &hHash ) ) { if ( CryptHashData( hHash, static_cast< const BYTE * >( pBuf ), nLen, 0 ) ) { BYTE bHash[16]; DWORD dwHashLen = 16; if ( CryptGetHashParam( hHash, HP_HASHVAL, bHash, &dwHashLen, 0 ) ) { nHashLow = *reinterpret_cast< uint64 * >( &bHash[0] ); nHashHigh = *reinterpret_cast< uint64 * >( &bHash[8] ); bResult = true; } } CryptDestroyHash( hHash ); } CryptReleaseContext( hCryptProv, 0 ); } return bResult; } static uint64 ComputeComboHash( LPCSTR pSrcFile, CONST D3DXMACRO *pDefines, LPCSTR pFunctionName ) { std::vector< std::string > defines; CONST D3DXMACRO *pCurDefine = pDefines; while ( ( pCurDefine->Name ) && ( pCurDefine->Definition ) ) { char buf[1024]; sprintf_s( buf, sizeof( buf ), "%s=%s", pCurDefine->Name, pCurDefine->Definition ); defines.push_back( std::string( buf ) ); pCurDefine++; } std::sort( defines.begin(), defines.end() ); std::vector< uint8 > shaderSigBuf; shaderSigBuf.reserve( 1024 ); shaderSigBuf.insert( shaderSigBuf.end(), (uint8 *)pSrcFile, (uint8 *)pSrcFile + strlen( pSrcFile ) ); shaderSigBuf.insert( shaderSigBuf.end(), (uint8 *)pFunctionName, (uint8 *)pFunctionName + strlen( pFunctionName ) ); for ( uint i = 0; i < defines.size(); ++i ) { const char *pDefineStr = defines[i].c_str(); shaderSigBuf.insert( shaderSigBuf.end(), (uint8 *)pDefineStr, (uint8 *)pDefineStr + strlen( pDefineStr ) ); } uint64 nHashLow = 0, nHashHigh = 0; if ( shaderSigBuf.size() ) { HashBuffer( &shaderSigBuf[0], shaderSigBuf.size(), nHashLow, nHashHigh ); } return nHashLow ^ nHashHigh; } static void WriteToCompileLogFile( const char *pMsg ) { char szLogFilename[MAX_PATH]; if ( !GetEnvironmentVariableA( "PS3COMPILELOG", szLogFilename, sizeof( szLogFilename ) ) ) return; HANDLE hMutex = CreateMutex( NULL, FALSE, "PS3COMPILELOGMUTEX" ); if ( ( hMutex == NULL ) || ( WaitForSingleObject( hMutex, 10000 ) != WAIT_OBJECT_0 ) ) return; FILE *pFile = fopen( szLogFilename, "a+" ); if ( !pFile ) { ReleaseMutex( hMutex ); return; } fputs( pMsg, pFile ); fclose( pFile ); ReleaseMutex( hMutex ); } static void UpdateCompileLogFile( LPCSTR pSrcFile, uint64 nComboHash, const SceSpMeasurementResult &origStatistics, const SceSpMeasurementResult &bestStatistics, int nBestSchedule, uint nBestSeed, int nShaderSchedulerSourceIndex, int nDbgStatusIndex ) { char szComputerName[512]; DWORD nSize = sizeof( szComputerName ); GetComputerNameA( szComputerName, &nSize ); uint64 nGUID = CreateGUID64(); char msg[1024]; sprintf_s( msg, sizeof( msg ), "%s,%016I64X,\"%s\",%016I64X,%u,%u,%u,%u,%i,%i,%i,%i\n", szComputerName, nGUID, pSrcFile, nComboHash, origStatistics.nCycles, origStatistics.nRRegisters, bestStatistics.nCycles, bestStatistics.nRRegisters, nBestSchedule, nBestSeed, nShaderSchedulerSourceIndex, nDbgStatusIndex ); WriteToCompileLogFile( msg ); } class COptimalComboFile { public: COptimalComboFile() : g_bTriedToLoadOptimalCombos( false ) { InitializeCriticalSection( &m_CS ); } ~COptimalComboFile() { DeleteCriticalSection( &m_CS ); } bool Load( const char *pFilename ) { Lock(); if ( g_OptimalCombos.empty() ) { if ( g_bTriedToLoadOptimalCombos ) { Unlock(); return false; } g_bTriedToLoadOptimalCombos = true; FILE *pFile = fopen( pFilename, "rb" ); if ( !pFile ) { Unlock(); return false; } fseek( pFile, 0, SEEK_END ); const uint nFilesize = ftell( pFile ); fseek( pFile, 0, SEEK_SET ); g_OptimalCombos.resize( nFilesize ); if ( fread( &g_OptimalCombos[0], nFilesize, 1, pFile) != 1 ) { fclose( pFile ); g_OptimalCombos.clear(); Unlock(); return false; } fclose( pFile ); const OptimalComboScheduleFileHeader_t *pHeader = reinterpret_cast< const OptimalComboScheduleFileHeader_t * >( &g_OptimalCombos[0] ); if ( ( pHeader->m_nID != OPTIMAL_COMBO_SCHEDULE_FILE_HEADER_ID ) || ( !pHeader->m_nNumCombos ) ) { g_OptimalCombos.clear(); Unlock(); return false; } } Unlock(); return true; } bool GetOptimalScheduleForCombo( uint64 nComboHash, int &nBestSchedule, int &nBestSeed, SceSpMeasurementResult &bestStatistics ) { if ( g_OptimalCombos.empty() ) return false; const OptimalComboScheduleFileHeader_t *pHeader = reinterpret_cast< const OptimalComboScheduleFileHeader_t * >( &g_OptimalCombos[0] ); const OptimalComboScheduleFileRecord_t *pCombos = reinterpret_cast< const OptimalComboScheduleFileRecord_t * >( &g_OptimalCombos[sizeof( OptimalComboScheduleFileHeader_t )] ); int low = 0; int high = pHeader->m_nNumCombos - 1; while ( low <= high ) { const int mid = ( low + high ) >> 1; const OptimalComboScheduleFileRecord_t &combo = pCombos[mid]; if ( nComboHash == combo.m_nComboHash ) { if ( combo.m_nOptSchedule == OptimalComboScheduleFileRecord_t::cDefaultScheduleIndex ) { nBestSchedule = -1; nBestSeed = 0; } else { nBestSchedule = combo.m_nOptSchedule; nBestSeed = combo.m_nOptSeed; } bestStatistics.nResult = SCESP_OK; bestStatistics.nCycles = combo.m_nOptCycles; bestStatistics.nRRegisters = 100; // bogus value - shouldn't matter bestStatistics.nThroughput = 1; // bogus value - shouldn't matter return true; } else if ( nComboHash < combo.m_nComboHash ) { high = mid - 1; } else { low = mid + 1; } } return false; } private: void Lock() { EnterCriticalSection( &m_CS ); } void Unlock() { LeaveCriticalSection( &m_CS ); } CRITICAL_SECTION m_CS; std::vector< uint8 > g_OptimalCombos; bool g_bTriedToLoadOptimalCombos; }; class CCompiledShader { // Purposely undefined. CCompiledShader( const CCompiledShader & ); CCompiledShader& operator= ( const CCompiledShader & ); public: CCompiledShader() : m_pShader( NULL ), m_pErrorMsgs( NULL ), m_last_hres( E_FAIL ), m_nSchedule( -1 ), m_nSeed( 0 ), m_nOptLevel( 1 ) { memset( &m_Statistics, 0, sizeof( m_Statistics ) ); m_Statistics.nResult = SCESP_ERROR_UNKNOWN; } ~CCompiledShader() { Clear(); } void Clear() { if ( m_pShader ) { m_pShader->Release(); m_pShader = NULL; } if ( m_pErrorMsgs ) { m_pErrorMsgs->Release(); m_pErrorMsgs = NULL; } memset( &m_Statistics, 0, sizeof( m_Statistics ) ); m_Statistics.nResult = SCESP_ERROR_UNKNOWN; m_last_hres = E_FAIL; m_nSchedule = -1; m_nSeed = 0; m_nOptLevel = 1; } LPD3DXBUFFER GetShader() { return m_pShader; } LPD3DXBUFFER GetErrorMsgs() { return m_pErrorMsgs; } LPD3DXBUFFER GetShaderAndReleaseOwnership() { LPD3DXBUFFER pShader = m_pShader; m_pShader = NULL; return pShader; } LPD3DXBUFFER GetErrorMsgsAndReleaseOwnership() { LPD3DXBUFFER pErrorMsgs = m_pErrorMsgs; m_pErrorMsgs = NULL; return pErrorMsgs; } const SceSpMeasurementResult &GetStatistics() const { return m_Statistics; } HRESULT GetLastHRESULT() const { return m_last_hres; } int GetSchedule() const { return m_nSchedule; } int GetSeed() const { return m_nSeed; } int GetOptLevel() const { return m_nOptLevel; } // Proxied routines HRESULT Compile( LPCSTR pSrcFile, CONST D3DXMACRO* pDefines, LPD3DXINCLUDE pInclude, LPCSTR pFunctionName, LPCSTR pProfile, DWORD Flags, int nRandSched = -1, int nRandSeed = -1, int nOptLevel = 1, int *pDbgStatusIndex = NULL ) { Clear(); m_nSchedule = nRandSched; m_nSeed = nRandSeed; m_nOptLevel = nOptLevel; LPD3DXBUFFER *ppShader = &m_pShader; LPD3DXBUFFER *ppErrorMsgs = &m_pErrorMsgs; bool bFragmentShader = false; const char * pRsxProfile = pProfile; if ( *pProfile == 'v' ) // guessing it's a vertex shader profile { pRsxProfile = "sce_vp_rsx"; } else if ( *pProfile == 'p' ) // guessing it's a pixel shader profile { pRsxProfile = "sce_fp_rsx"; bFragmentShader = true; } if ( !pInclude ) pInclude = &s_incDxImpl; // Open the top-level file via our include interface LPCVOID lpcvData; UINT numBytes; HRESULT hr = pInclude->Open( ( D3DXINCLUDE_TYPE ) 0, pSrcFile, NULL, &lpcvData, &numBytes ); if ( FAILED( hr ) ) { m_last_hres = hr; return hr; } LPCSTR pShaderData = ( LPCSTR ) lpcvData; g_pInclude = pInclude; CGCinclude incWrap; incWrap.close = CgcIncludeClose; incWrap.open = CgcIncludeOpen; std::vector options; if ( pDefines ) { for ( const D3DXMACRO * pMacro = pDefines; pMacro->Name; pMacro++ ) { std::string strOpt = "-D"; strOpt += pMacro->Name; if( pMacro->Definition && *pMacro->Definition ) { if ( !strncmp( pMacro->Name, "PS3REGCOUNT", 11 ) ) { options.push_back( "-regcount" ); options.push_back( pMacro->Name + 11 ); continue; } // Common case: strOpt += "="; strOpt += pMacro->Definition; } options.push_back( strOpt ); } } char buf[512]; if ( ( bFragmentShader ) && ( nRandSched >= 1 ) ) { options.push_back( "-po" ); sprintf( buf, "randomSched=%i", nRandSched ); options.push_back( std::string( buf ) ); options.push_back( "-po" ); sprintf( buf, "randomSeed=%i", nRandSeed ); options.push_back( std::string( buf ) ); } options.push_back( "-inline" ); options.push_back( "all" ); options.push_back( "-fastmath" ); sprintf( buf, "-O%i", nOptLevel ); options.push_back( buf ); const char ** ppOptions = (const char**)stackalloc( sizeof(char*) * ( options.size() + 1 ) ); for( uint i = 0; i < options.size(); ++i ) ppOptions[i] = options[i].c_str(); ppOptions[options.size()] = NULL; DebugLog("%s:%s/%s", pSrcFile, pProfile, pRsxProfile ); CgContextWrapper cgcc; BlobAdaptor *pCompiledShader = new BlobAdaptor(), *pMessages = new BlobAdaptor(), *asciiOutput = new BlobAdaptor(); int status = sceCgcCompileString( cgcc, pShaderData, pRsxProfile, pFunctionName, ppOptions, pCompiledShader->m_bin, pMessages->m_bin, asciiOutput->m_bin, &incWrap ); if ( ( !status ) && ( pCompiledShader ) && ( pCompiledShader->m_bin ) ) { const char* optStr[] = { NULL }; char *pBinData = static_cast< char * >( sceCgcGetBinData( pCompiledShader->m_bin ) ); int nBinSize = sceCgcGetBinSize( pCompiledShader->m_bin ); SceSpResult res = sceShaderPerfMeasure( pBinData, nBinSize, optStr, &m_Statistics ); if ( res != SCESP_OK ) { DebugLog( "sceShaderPerfMeasure failed with status %i", res ); if ( pDbgStatusIndex ) { *pDbgStatusIndex = -1; } } } pCompiledShader->Bake(); *ppShader = pCompiledShader; *ppErrorMsgs = new BlobAdaptor( pMessages, asciiOutput ); #ifdef _DEBUG if( status ) DebugLog( "Error %d:\n%s\n%s", status, pMessages->GetBufferPointer(), asciiOutput->GetBufferPointer() ); else DebugLog( "Success %d bytes", pCompiledShader->GetBufferSize() ); #endif pMessages->Release(); asciiOutput->Release(); hr = ( status == SCECGC_OK ? S_OK : 0x80000005 ); // Close the file pInclude->Close( lpcvData ); m_last_hres = hr; return hr; } CCompiledShader &TakeOwnership( CCompiledShader &src ) { if ( this == &src ) return *this; Clear(); m_last_hres = src.m_last_hres; m_pShader = src.m_pShader; src.m_pShader = NULL; m_pErrorMsgs = src.m_pErrorMsgs; src.m_pErrorMsgs = NULL; m_Statistics = src.m_Statistics; m_nSchedule = src.m_nSchedule; m_nSeed = src.m_nSeed; m_nOptLevel = src.m_nOptLevel; return *this; } private: HRESULT m_last_hres; LPD3DXBUFFER m_pShader; LPD3DXBUFFER m_pErrorMsgs; SceSpMeasurementResult m_Statistics; int m_nSchedule; int m_nSeed; int m_nOptLevel; }; COptimalComboFile g_OptimalComboFile; // Proxied routines #pragma comment(linker, "/EXPORT:Proxy_D3DXCompileShaderFromFile=?Proxy_D3DXCompileShaderFromFile@@YGJPBDPBU_D3D_SHADER_MACRO@@PAUID3DInclude@@00KPAPAUID3D10Blob@@3PAPAX@Z") HRESULT WINAPI Proxy_D3DXCompileShaderFromFile(LPCSTR pSrcFile, CONST D3DXMACRO* pDefines, LPD3DXINCLUDE pInclude, LPCSTR pFunctionName, LPCSTR pProfile, DWORD Flags, LPD3DXBUFFER* ppShader, LPD3DXBUFFER* ppErrorMsgs, LPD3DXCONSTANTTABLE* ppConstantTable ) { *ppShader = NULL; *ppErrorMsgs = NULL; if ( ppConstantTable ) *ppConstantTable = NULL; static bool bInitializedShaderPerfLib; if ( !bInitializedShaderPerfLib ) { bInitializedShaderPerfLib = true; sceShaderPerfInit(); } if ( *pProfile == 'v' ) { CCompiledShader compiledShader; HRESULT hres = compiledShader.Compile( pSrcFile, pDefines, pInclude, pFunctionName, pProfile, Flags, -1, -1, 1 ); if ( FAILED( hres ) ) { *ppErrorMsgs = compiledShader.GetErrorMsgsAndReleaseOwnership(); return hres; } *ppShader = compiledShader.GetShaderAndReleaseOwnership(); return S_OK; } const uint nStartTime = GetTickCount(); const uint64 nComboHash = ComputeComboHash( pSrcFile, pDefines, pFunctionName ); char szOptimalScheduleFile[MAX_PATH]; const bool bUseOptimalSchedulingFile = GetEnvironmentVariableA( "PS3OPTIMALSCHEDULESFILE", szOptimalScheduleFile, sizeof( szOptimalScheduleFile ) ) && szOptimalScheduleFile[0]; char szFindOptimalSchedulesValue[MAX_PATH]; const bool bFindOptimalScheduling = !bUseOptimalSchedulingFile && ( GetEnvironmentVariableA( "PS3FINDOPTIMALSCHEDULES", szFindOptimalSchedulesValue, sizeof( szFindOptimalSchedulesValue ) ) && ( szFindOptimalSchedulesValue[0] == '1' ) ); ShaderSchedulerParamSource_t nShaderSchedulerSourceIndex = SHADER_SCHEDULER_PARAM_SOURCE_UNOPTIMIZED; SceSpMeasurementResult trainedScheduleResults; memset( &trainedScheduleResults, 0, sizeof( trainedScheduleResults ) ); int nTrainedSchedule = -1; int nTrainedSeed = 0; int nDbgStatusIndex = 0; if ( ( bUseOptimalSchedulingFile ) && ( g_OptimalComboFile.Load( szOptimalScheduleFile ) ) ) { if ( g_OptimalComboFile.GetOptimalScheduleForCombo( nComboHash, nTrainedSchedule, nTrainedSeed, trainedScheduleResults ) ) { nShaderSchedulerSourceIndex = SHADER_SCHEDULER_PARAM_SOURCE_FROM_SCHEDULER_FILE; nDbgStatusIndex = 1; } } uint nTotalCompiles = 0; CCompiledShader defaultShader; nTotalCompiles++; HRESULT hres = defaultShader.Compile( pSrcFile, pDefines, pInclude, pFunctionName, pProfile, Flags, nTrainedSchedule, nTrainedSeed, CGC_COMPILER_OPTIMIZATION_LEVEL, &nDbgStatusIndex ); if ( FAILED( hres ) ) { *ppErrorMsgs = defaultShader.GetErrorMsgsAndReleaseOwnership(); return hres; } CCompiledShader bestShader; bestShader.TakeOwnership( defaultShader ); if ( ( nShaderSchedulerSourceIndex == SHADER_SCHEDULER_PARAM_SOURCE_FROM_SCHEDULER_FILE ) && ( defaultShader.GetStatistics().nCycles > trainedScheduleResults.nCycles ) ) { // The optimal schedule params stored in the ps3optimalschedules.bin file didn't produce the expected results (the shader was modified since the // schedules where optimized), so try falling back to the compiler's default scheduling. (Which may not be any better, but at least we'll never get worse than the default schedule.) nTotalCompiles++; CCompiledShader alternateShader; HRESULT hres = alternateShader.Compile( pSrcFile, pDefines, pInclude, pFunctionName, pProfile, Flags, -1, 0, CGC_COMPILER_OPTIMIZATION_LEVEL, &nDbgStatusIndex ); if ( FAILED( hres ) ) { *ppErrorMsgs = alternateShader.GetErrorMsgsAndReleaseOwnership(); return hres; } if ( alternateShader.GetStatistics() < bestShader.GetStatistics() ) { nShaderSchedulerSourceIndex = SHADER_SCHEDULER_PARAM_SOURCE_UNOPTIMIZED_FALLBACK; bestShader.TakeOwnership( alternateShader ); nDbgStatusIndex = 2; } } SceSpMeasurementResult origStatistics( bestShader.GetStatistics() ); // Don't bother trying to optimize tiny shaders, the potential gain is not worth it (and they're probably fill bound anyway). if ( ( bFindOptimalScheduling ) && ( ( bestShader.GetStatistics().nCycles > 5 ) || ( bestShader.GetStatistics().nRRegisters > 2 ) ) ) { // Important: Watch the ranges of rand_schedule and rand_seed. See COMBO_SEED_BITS and COMBO_SCHEDULE_BITS. for ( int nRandSchedIndex = 0; nRandSchedIndex < NUM_RANDOM_SCHEDULE_VALUES; ++nRandSchedIndex ) { const int nRandSched = g_nRandSched[nRandSchedIndex]; for ( int nTrial = 0; nTrial < NUM_RANDOM_SCHEDULE_SEEDS; ++nTrial ) { const int nRandSeed = 10 + nTrial * 8; nTotalCompiles++; CCompiledShader trialShader; HRESULT hres = trialShader.Compile( pSrcFile, pDefines, pInclude, pFunctionName, pProfile, Flags, nRandSched, nRandSeed, CGC_COMPILER_OPTIMIZATION_LEVEL, &nDbgStatusIndex ); if ( FAILED( hres ) ) { *ppErrorMsgs = trialShader.GetErrorMsgsAndReleaseOwnership(); return hres; } if ( trialShader.GetStatistics() < bestShader.GetStatistics() ) { bestShader.TakeOwnership( trialShader ); nShaderSchedulerSourceIndex = SHADER_SCHEDULER_PARAM_SOURCE_FOUND_OPTIMAL; nDbgStatusIndex = 3; } } } } *ppShader = bestShader.GetShaderAndReleaseOwnership(); const uint nEndTime = GetTickCount(); double flTotalTime = ( nEndTime - nStartTime ) * .001f; flTotalTime; UpdateCompileLogFile( pSrcFile, nComboHash, origStatistics, bestShader.GetStatistics(), bestShader.GetSchedule(), bestShader.GetSeed(), nShaderSchedulerSourceIndex, nDbgStatusIndex ); #if 0 printf( "Orig cycles/registers: %u (%u), Optimized cycles/registers: %u (%u), Total compiles: %u, ms per compile: %f\n", origStatistics.nCycles, origStatistics.nRRegisters, bestShader.GetStatistics().nCycles, bestShader.GetStatistics().nRRegisters, nTotalCompiles, 1000.0f * ( flTotalTime / nTotalCompiles ) ); #endif return S_OK; }