//============ Copyright (c) Valve Corporation, All rights reserved. ============
//
// cglmbuffer.cpp 
//
//===============================================================================

#include "togl/rendermechanism.h"

// memdbgon -must- be the last include file in a .cpp file.
#include "tier0/memdbgon.h"

// LINUXTODO : took out cmdline here
// When true, dynamic CGLMBuffers are created as "pseudo" buffers (plain system memory, no GL buffer object) - see the CGLMBuffer constructor.
// The command line lookup is commented out, so this is currently always false.
bool g_bUsePseudoBufs = false; //( Plat_GetCommandLineA() ) ? ( strstr( Plat_GetCommandLineA(), "-gl_enable_pseudobufs" ) != NULL ) : false;
// When true, CGLMBuffer::Lock() never takes the static staging buffer path (m_StaticBuffers + glBufferSubData on unlock).
#ifdef OSX
// Significant perf degradation on some OSX parts if static buffers not disabled
bool g_bDisableStaticBuffer = true;
#else
bool g_bDisableStaticBuffer = false; //( Plat_GetCommandLineA() ) ? ( strstr( Plat_GetCommandLineA(), "-gl_disable_static_buffer" ) != NULL ) : false;
#endif

// http://www.opengl.org/registry/specs/ARB/vertex_buffer_object.txt
// http://www.opengl.org/registry/specs/ARB/pixel_buffer_object.txt

// gl_bufmode: zero means we mark all vertex/index buffers static.
// Non-zero means buffers are initially marked static,
// but can shift to dynamic upon first 'discard' (orphaning).

// #define REPORT_LOCK_TIME	0

// Read by CGLMBuffer::Lock() on every discard: non-zero re-specifies the buffer with GL_DYNAMIC_DRAW_ARB, zero with GL_STATIC_DRAW_ARB.
ConVar gl_bufmode( "gl_bufmode", "1" );

// Shared CPU-side staging storage handed out by CGLMBuffer::Lock(); slot 0 is used for vertex buffers and slot 1 for index buffers.
char ALIGN16 CGLMBuffer::m_StaticBuffers[ GL_MAX_STATIC_BUFFERS ][ GL_STATIC_BUFFER_SIZE ] ALIGN16_POST;
// NOTE(review): not referenced anywhere in this file - presumably tracks which staging slots are in use; confirm against the header.
bool CGLMBuffer::m_bStaticBufferUsed[ GL_MAX_STATIC_BUFFERS ];

// Defined elsewhere; when set (and TOGL_SUPPORT_NULL_DEVICE is enabled) the GL upload calls in this file are skipped.
extern bool g_bNullD3DDevice; 

//===========================================================================//

// Per buffer type high-water mark of the persistent buffer write offset; updated by CPersistentBuffer::Append()
// and reported by the gl_persistent_buffer_max_offset console command.
static uint gMaxPersistentOffset[kGLMNumBufferTypes] = { 0, 0, 0, 0 };
// Console command: prints, for each buffer type, the largest offset ever reached in the persistent buffers
// (as recorded in gMaxPersistentOffset), both in bytes and in megabytes.
CON_COMMAND( gl_persistent_buffer_max_offset, "" )
{
	const float flBytesPerMB = 1024.0f * 1024.0f;

	// The counters are unsigned, so they are printed with %u (a %d would show values above 2 GB as negative).
	ConMsg( "OpenGL Persistent buffer max offset :\n" );
	ConMsg( "  Vertex buffer : %u bytes (%f MB) \n", gMaxPersistentOffset[kGLMVertexBuffer], gMaxPersistentOffset[kGLMVertexBuffer] / flBytesPerMB );
	ConMsg( "  Index buffer : %u bytes (%f MB) \n", gMaxPersistentOffset[kGLMIndexBuffer], gMaxPersistentOffset[kGLMIndexBuffer] / flBytesPerMB );
	ConMsg( "  Uniform buffer : %u bytes (%f MB) \n", gMaxPersistentOffset[kGLMUniformBuffer], gMaxPersistentOffset[kGLMUniformBuffer] / flBytesPerMB );
	ConMsg( "  Pixel buffer : %u bytes (%f MB) \n", gMaxPersistentOffset[kGLMPixelBuffer], gMaxPersistentOffset[kGLMPixelBuffer] / flBytesPerMB );
}

// Constructs an empty persistent buffer: no GL buffer object, no mapping, write offset at zero.
// Init() must be called before the buffer can be used.
// NOTE(review): m_type and m_buffGLTarget are not initialized here; they are only assigned in Init().
CPersistentBuffer::CPersistentBuffer()
:
	m_nSize( 0 )
	, m_nHandle( 0 )
	, m_pImmutablePersistentBuf( NULL )
	, m_nOffset( 0 )
#ifdef HAVE_GL_ARB_SYNC
	, m_nSyncObj( 0 )
#endif
{}

// Releases the GL buffer and its mapping (if any) via Deinit().
CPersistentBuffer::~CPersistentBuffer()
{
	Deinit();
}

// Creates an immutable GL buffer store of nSize bytes and maps it persistently (write-only, coherent).
//
//   type  - kGLMVertexBuffer or kGLMIndexBuffer; any other type is only accepted with nSize == 0.
//   nSize - size of the store in bytes; 0 leaves the buffer empty (no GL object is created).
//
// On any failure (unsupported type, or the mapping could not be established) the object is left empty
// (size 0, no handle, no mapping) rather than advertising storage it does not have.
void CPersistentBuffer::Init( EGLMBufferType type,uint nSize )
{
	Assert( gGL->m_bHave_GL_ARB_buffer_storage );
	Assert( gGL->m_bHave_GL_ARB_map_buffer_range );
	
	m_nSize		= nSize;
	m_nOffset	= 0;
	m_type		= type;
	
	switch ( type )
	{
	case kGLMVertexBuffer:	m_buffGLTarget = GL_ARRAY_BUFFER_ARB; break;
	case kGLMIndexBuffer:	m_buffGLTarget = GL_ELEMENT_ARRAY_BUFFER_ARB; break;

	default:
		// No GL target is known for this type. Never fall through to the GL calls below with an
		// unset m_buffGLTarget (which is what a release build would do for nSize != 0).
		Assert( nSize == 0 );
		m_nSize = 0;
		return;
	}
	
	if ( m_nSize == 0 )
	{
		return;
	}

	gGL->glGenBuffersARB( 1, &m_nHandle );
	gGL->glBindBufferARB( m_buffGLTarget, m_nHandle );

	// Create persistent immutable buffer that we will permanently map.  This buffer can be written from any thread (not just
	// the renderthread)
	gGL->glBufferStorage( m_buffGLTarget, m_nSize, (const GLvoid *)NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT ); // V_GL_REQ: GL_ARB_buffer_storage, GL_ARB_map_buffer_range, GL_VERSION_4_4

	// Map the buffer for all of eternity.  Pointer can be used from multiple threads.
	m_pImmutablePersistentBuf = gGL->glMapBufferRange( m_buffGLTarget, 0, m_nSize, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT ); // V_GL_REQ: GL_ARB_map_buffer_range, GL_ARB_buffer_storage, GL_VERSION_4_4
	Assert( m_pImmutablePersistentBuf != NULL );

	if ( !m_pImmutablePersistentBuf )
	{
		// Mapping failed: Deinit() bails out when there is no mapping, so the buffer object has to be
		// released here, and the size is zeroed so no storage is reported as available.
		gGL->glBindBufferARB( m_buffGLTarget, 0 );
		gGL->glDeleteBuffersARB( 1, &m_nHandle );

		m_nHandle	= 0;
		m_nSize		= 0;
	}
}

// Tears down the persistent mapping and deletes the GL buffer object.
// Does nothing when no mapping exists (never initialized, or already deinitialized).
void CPersistentBuffer::Deinit()
{
	if ( m_pImmutablePersistentBuf )
	{
		// Wait on any pending fence before touching the storage.
		BlockUntilNotBusy();

		// The buffer has to be bound to its target to be unmapped.
		gGL->glBindBufferARB( m_buffGLTarget, m_nHandle );
		gGL->glUnmapBuffer( m_buffGLTarget );
		gGL->glBindBufferARB( m_buffGLTarget, 0 );

		gGL->glDeleteBuffersARB( 1, &m_nHandle );

		// Back to the freshly constructed state.
		m_pImmutablePersistentBuf = NULL;
		m_nHandle	= 0;
		m_nSize		= 0;
		m_nOffset	= 0;
	}
}

// Replaces the buffer's sync object with a new fence placed at the current point of the GL command stream.
// Compiles to a no-op when HAVE_GL_ARB_SYNC is not defined.
void CPersistentBuffer::InsertFence()
{
#ifdef HAVE_GL_ARB_SYNC
	// Only one fence is tracked at a time; drop the previous one first.
	if ( m_nSyncObj != 0 )
		gGL->glDeleteSync( m_nSyncObj );

	m_nSyncObj = gGL->glFenceSync( GL_SYNC_GPU_COMMANDS_COMPLETE, 0 );
#endif
}

// Waits on the fence inserted by InsertFence() (if any), deletes it, and rewinds the write offset to the
// start of the buffer. Without HAVE_GL_ARB_SYNC only the rewind happens.
void CPersistentBuffer::BlockUntilNotBusy()
{
#ifdef HAVE_GL_ARB_SYNC
	if ( m_nSyncObj != 0 )
	{
		// NOTE(review): the timeout is in nanoseconds, so this is 3000 seconds; the wait result is not checked.
		gGL->glClientWaitSync( m_nSyncObj, GL_SYNC_FLUSH_COMMANDS_BIT, 3000000000000ULL );
		gGL->glDeleteSync( m_nSyncObj );
		m_nSyncObj = 0;
	}
#endif

	// Writing restarts from the beginning of the buffer.
	m_nOffset = 0;
}

// Advances the write offset by nSize bytes and records the new offset in the per-type high-water mark.
void CPersistentBuffer::Append( uint nSize )
{
	m_nOffset += nSize;
	Assert( m_nOffset <= m_nSize );

	// Keep the largest offset ever seen for this buffer type (reported by gl_persistent_buffer_max_offset).
	gMaxPersistentOffset[m_type] = Max( gMaxPersistentOffset[m_type], m_nOffset );
}

//===========================================================================//

#if GL_ENABLE_INDEX_VERIFICATION

// Constructs an unbound span manager: no context, no spans, no allocations tracked.
// m_nSpanEndMax == -1 means "no span has been added since the last discard".
CGLMBufferSpanManager::CGLMBufferSpanManager() : 
	m_pCtx( NULL ),
	m_nBufType( kGLMVertexBuffer ),
	m_nBufSize( 0 ),
	m_bDynamic( false ),
	m_nSpanEndMax( -1 ),
	m_nNumAllocatedBufs( 0 ),
	m_nTotalBytesAllocated( 0 )
{
}

// Releases all tracked spans via Deinit() (a no-op if the manager was never initialized or already deinitialized).
CGLMBufferSpanManager::~CGLMBufferSpanManager()
{
	Deinit();
}

// Binds the manager to a context and records the properties of the buffer whose locked spans it tracks.
//
//   pContext         - owning GL context.
//   nBufType         - kGLMIndexBuffer or kGLMVertexBuffer.
//   nInitialCapacity - number of span entries to pre-reserve in the active and deleted lists.
//   nBufSize         - total size of the tracked buffer, in bytes (used for diagnostics).
//   bDynamic         - whether the tracked buffer is dynamic (used for diagnostics).
void CGLMBufferSpanManager::Init( GLMContext *pContext, EGLMBufferType nBufType, uint nInitialCapacity, uint nBufSize, bool bDynamic )
{
	Assert( ( nBufType == kGLMIndexBuffer ) || ( nBufType == kGLMVertexBuffer ) );

	// Description of the tracked buffer.
	m_pCtx		= pContext;
	m_nBufType	= nBufType;
	m_nBufSize	= nBufSize;
	m_bDynamic	= bDynamic;

	// Bookkeeping starts out empty.
	m_nSpanEndMax			= -1;
	m_nNumAllocatedBufs		= 0;
	m_nTotalBytesAllocated	= 0;

	m_ActiveSpans.EnsureCapacity( nInitialCapacity );
	m_DeletedSpans.EnsureCapacity( nInitialCapacity );
}

// Fills in 'buf' as a bookkeeping record for an nSize byte span and updates the allocation counters.
// No GL object is created (the handle is always 0). Always returns true.
bool CGLMBufferSpanManager::AllocDynamicBuf( uint nSize, GLDynamicBuf_t &buf )
{
	buf.m_nHandle			= 0;
	buf.m_nGLType			= GetGLBufType();
	buf.m_nSize				= nSize;
	buf.m_nActualBufSize	= nSize;

	++m_nNumAllocatedBufs;
	m_nTotalBytesAllocated += buf.m_nActualBufSize;

	return true;
}

// Reverses the accounting done by AllocDynamicBuf() for 'buf'.
void CGLMBufferSpanManager::ReleaseDynamicBuf( GLDynamicBuf_t &buf )
{
	// Both counters must still cover this buffer.
	Assert( m_nNumAllocatedBufs > 0 );
	Assert( m_nTotalBytesAllocated >= (int)buf.m_nActualBufSize );

	--m_nNumAllocatedBufs;
	m_nTotalBytesAllocated -= buf.m_nActualBufSize;
}

void CGLMBufferSpanManager::Deinit()
{
	if ( !m_pCtx )
		return;

	for ( int i = 0; i < m_ActiveSpans.Count(); i++ )
	{
		if ( m_ActiveSpans[i].m_bOriginalAlloc )
			ReleaseDynamicBuf( m_ActiveSpans[i].m_buf );
	}
	m_ActiveSpans.SetCountNonDestructively( 0 );

	for ( int i = 0; i < m_DeletedSpans.Count(); i++ )
		ReleaseDynamicBuf( m_DeletedSpans[i].m_buf );

	m_DeletedSpans.SetCountNonDestructively( 0 );

	m_pCtx->BindGLBufferToCtx( GetGLBufType(), NULL, true );

	m_nSpanEndMax = -1;
	m_pCtx = NULL;

	Assert( !m_nNumAllocatedBufs );
	Assert( !m_nTotalBytesAllocated );
}

void CGLMBufferSpanManager::DiscardAllSpans()
{
	for ( int i = 0; i < m_ActiveSpans.Count(); i++ )
	{
		if ( m_ActiveSpans[i].m_bOriginalAlloc )
			ReleaseDynamicBuf( m_ActiveSpans[i].m_buf );
	}
	m_ActiveSpans.SetCountNonDestructively( 0 );

	for ( int i = 0; i < m_DeletedSpans.Count(); i++ )
		ReleaseDynamicBuf( m_DeletedSpans[i].m_buf );

	m_DeletedSpans.SetCountNonDestructively( 0 );

	m_nSpanEndMax = -1;

	Assert( !m_nNumAllocatedBufs );
	Assert( !m_nTotalBytesAllocated );
}

// TODO: Add logic to detect incorrect usage of bNoOverwrite.
// Records that the byte range [nOffset, nOffset + nActualSize) of the tracked buffer has been written.
//
//   nOffset      - start of the written range, in bytes.
//   nMaxSize     - size of the lock the write came from (only used in the diagnostic message).
//   nActualSize  - number of bytes actually written.
//   bDiscard     - currently unused.
//   bNoOverwrite - currently unused.
//
// Existing active spans that overlap the new range are trimmed, split or removed so that active spans
// never overlap each other. Returns the newly added span, or NULL if its buffer record could not be
// allocated. The returned pointer refers to an element of m_ActiveSpans, so it is only valid until that
// list is modified again.
CGLMBufferSpanManager::ActiveSpan_t *CGLMBufferSpanManager::AddSpan( uint nOffset, uint nMaxSize, uint nActualSize, bool bDiscard, bool bNoOverwrite  )
{
	(void)bDiscard;
	(void)bNoOverwrite;

	const uint nStart = nOffset;
	const uint nSize = nActualSize;
	const uint nEnd = nStart + nSize;

	GLDynamicBuf_t newDynamicBuf;
	if ( !AllocDynamicBuf( nSize, newDynamicBuf ) )
	{
		DXABSTRACT_BREAK_ON_ERROR();
		return NULL;
	}

	// m_nSpanEndMax is the largest end offset recorded since the last discard, so a new span starting at
	// or beyond it cannot overlap anything and the scan below is skipped.
	if ( (int)nStart < m_nSpanEndMax )
	{
		// Lock region potentially overlaps another previously locked region (since the last discard) - this is a very rarely (if ever) taken path in Source1 games.
		int i = 0;
		while ( i < m_ActiveSpans.Count() )
		{
			ActiveSpan_t &existingSpan = m_ActiveSpans[i];
			if ( ( nEnd <= existingSpan.m_nStart ) || ( nStart >= existingSpan.m_nEnd ) )
			{
				// No overlap with this span.
				i++;
				continue;
			}

			Warning( "GL performance warning: AddSpan() at offset %u max size %u actual size %u, on a %s %s buffer of total size %u, overwrites an existing active lock span at offset %u size %u!\n", 
				nOffset, nMaxSize, nActualSize, 
				m_bDynamic ? "dynamic" : "static", ( m_nBufType == kGLMVertexBuffer ) ? "vertex" : "index", m_nBufSize, 
				existingSpan.m_nStart, existingSpan.m_nEnd - existingSpan.m_nStart );
			
			if ( ( nStart <= existingSpan.m_nStart ) && ( nEnd >= existingSpan.m_nEnd ) )
			{
				if ( existingSpan.m_bOriginalAlloc )
				{
					// New span totally covers existing span
					// Can't immediately delete the span's buffer because it could be referred to by another (child) span.
					m_DeletedSpans.AddToTail( existingSpan );
				}

				// Delete span (swap the last element into its slot; the slot is re-examined, so i is not advanced)
				m_ActiveSpans[i] = m_ActiveSpans[ m_ActiveSpans.Count() - 1 ];
				m_ActiveSpans.SetCountNonDestructively( m_ActiveSpans.Count() - 1 );
				continue;
			}

			// New span does NOT fully cover the existing span (partial overlap)
			if ( nStart < existingSpan.m_nStart )
			{
				// New span starts before existing span, but ends somewhere inside, so shrink it (start moves "right")
				existingSpan.m_nStart = nEnd;
			}
			else if ( nEnd > existingSpan.m_nEnd )
			{
				// New span ends after existing span, but starts somewhere inside (end moves "left")
				existingSpan.m_nEnd = nStart;
			}
			else if ( nStart == existingSpan.m_nStart )
			{
				// New span lies inside of existing span and shares its start (start moves "right")
				existingSpan.m_nStart = nEnd;
			}
			else
			{
				// New span lies inside of existing span and starts after its start: the existing span keeps
				// the part to the left of the new span.
				const uint nExistingEnd = existingSpan.m_nEnd;
				existingSpan.m_nEnd = nStart;
				Assert( existingSpan.m_nStart < existingSpan.m_nEnd );

				if ( nEnd < nExistingEnd )
				{
					// New span is completely inside existing span, so the part to the right of the new span
					// becomes a child span sharing the same buffer.
					// AddToTail() may grow (reallocate) m_ActiveSpans, which would leave 'existingSpan'
					// dangling - so everything needed from it is copied out first and it is not touched
					// again afterwards.
					GLDynamicBuf_t sharedBuf = existingSpan.m_buf;
					m_ActiveSpans.AddToTail( ActiveSpan_t( nEnd, nExistingEnd, sharedBuf, false ) );
				}

				i++;
				continue;
			}

			Assert( existingSpan.m_nStart < existingSpan.m_nEnd );
			i++;
		}
	}

	newDynamicBuf.m_nLockOffset = nStart;
	newDynamicBuf.m_nLockSize = nSize;

	m_ActiveSpans.AddToTail( ActiveSpan_t( nStart, nEnd, newDynamicBuf, true ) );
	m_nSpanEndMax = MAX( m_nSpanEndMax, (int)nEnd );

	return &m_ActiveSpans.Tail();
}

// Returns true when every byte of [nOffset, nOffset + nSize) is covered by the active spans, i.e. the
// whole range has been written since the last discard.
bool CGLMBufferSpanManager::IsValid( uint nOffset, uint nSize ) const
{
	const uint nEnd = nOffset + nSize;

	// Bytes of the queried range not yet accounted for by a span.
	int nBytesUncovered = nSize;

	// Walk the spans newest first.
	for ( int iSpan = m_ActiveSpans.Count(); iSpan-- > 0; )
	{
		const ActiveSpan_t &span = m_ActiveSpans[iSpan];

		const bool bOverlapsQuery = ( span.m_nEnd > nOffset ) && ( span.m_nStart < nEnd );
		if ( !bOverlapsQuery )
			continue;

		// Clip the span to the queried range.
		uint nOverlapStart = ( span.m_nStart > nOffset ) ? span.m_nStart : nOffset;
		uint nOverlapEnd = ( span.m_nEnd < nEnd ) ? span.m_nEnd : nEnd;
		Assert( nOverlapStart <= nOverlapEnd );

		nBytesUncovered -= ( nOverlapEnd - nOverlapStart );
		Assert( nBytesUncovered >= 0 );

		if ( nBytesUncovered <= 0 )
			break;
	}

	return ( nBytesUncovered == 0 );
}
#endif // GL_ENABLE_INDEX_VERIFICATION

// glBufferSubData() with a max size limit, to work around NVidia's threaded driver limits (anything > than roughly 256KB triggers a sync with the server thread).
//
//   target          - GL buffer target the destination buffer is bound to.
//   offset          - byte offset into the destination buffer.
//   size            - number of bytes to upload; nothing is uploaded when size <= 0.
//   data            - source bytes.
//   nMaxSizePerCall - largest number of bytes passed to a single glBufferSubData() call;
//                     0 means "no limit" (a single call).
//
// Does nothing when running on the null device (TOGL_SUPPORT_NULL_DEVICE builds).
void glBufferSubDataMaxSize( GLenum target, GLintptr offset, GLsizeiptr size, const GLvoid *data, uint nMaxSizePerCall )
{
#if TOGL_SUPPORT_NULL_DEVICE
	if ( g_bNullD3DDevice ) return;
#endif

	if ( size <= 0 )
	{
		return;
	}

	// A zero chunk size would never make progress, so treat it as "upload everything in one call".
	const GLsizeiptr nChunkLimit = nMaxSizePerCall ? (GLsizeiptr)nMaxSizePerCall : size;
	const unsigned char *pSrc = static_cast<const unsigned char *>( data );

	// Progress is tracked in GLsizeiptr so that the full width of 'size' is honoured (no truncation to 32 bits).
	GLsizeiptr nBytesDone = 0;
	while ( nBytesDone < size )
	{
		const GLsizeiptr nBytesLeft = size - nBytesDone;
		const GLsizeiptr nBytesToCopy = ( nBytesLeft < nChunkLimit ) ? nBytesLeft : nChunkLimit;

		gGL->glBufferSubData( target, offset + nBytesDone, nBytesToCopy, pSrc + nBytesDone );

		nBytesDone += nBytesToCopy;
	}
}

// Creates a buffer of 'size' bytes of the given type on context pCtx.
//
//   pCtx    - owning GL context; must be current.
//   type    - vertex / index / uniform / pixel buffer; selects the GL target.
//   size    - requested size in bytes.
//   options - GLMBufferOption* flags; GLMBufferOptionDynamic marks the buffer dynamic.
//
// The buffer is either a "pseudo" buffer (plain system memory, GL handle 0) or a real GL buffer object
// whose storage is allocated here with a usage hint derived from the type and the dynamic flag.
// Pseudo buffers are used when overwrite detection is compiled in, for index buffers when index
// verification is compiled in, or for dynamic buffers when g_bUsePseudoBufs is set.
// The constructor exits with no buffer of this type bound on the context.
CGLMBuffer::CGLMBuffer( GLMContext *pCtx, EGLMBufferType type, uint size, uint options )
{
	m_pCtx = pCtx;
	m_type = type;
	
	m_bDynamic = ( options & GLMBufferOptionDynamic ) != 0;
				
	switch ( m_type )
	{
		case kGLMVertexBuffer:	m_buffGLTarget = GL_ARRAY_BUFFER_ARB; break;
		case kGLMIndexBuffer:	m_buffGLTarget = GL_ELEMENT_ARRAY_BUFFER_ARB; break;
		case kGLMUniformBuffer:	m_buffGLTarget = GL_UNIFORM_BUFFER_EXT; break;
		case kGLMPixelBuffer:	m_buffGLTarget = GL_PIXEL_UNPACK_BUFFER_ARB; break;
		
		default: Assert(!"Unknown buffer type" ); DXABSTRACT_BREAK_ON_ERROR();
	}
				
	m_nSize = size;
	m_nActualSize = size;
	m_bMapped = false;
	m_pLastMappedAddress = NULL;

	m_pStaticBuffer = NULL;
	m_nPinnedMemoryOfs = -1;				// -1 == no pinned memory lock in flight (see Unlock)
	m_nPersistentBufferStartOffset = 0;
	m_bUsingPersistentBuffer = false;

	m_bEnableAsyncMap = false;
	m_bEnableExplicitFlush = false;
	m_dirtyMinOffset = m_dirtyMaxOffset = 0;								// adjust/grow on lock, clear on unlock

	m_pCtx->CheckCurrent();
	m_nRevision = rand();
		
	m_pPseudoBuf = NULL;
	m_pActualPseudoBuf = NULL;

	m_bPseudo = false;
		
#if GL_ENABLE_UNLOCK_BUFFER_OVERWRITE_DETECTION
	m_bPseudo = true;
#endif

#if GL_ENABLE_INDEX_VERIFICATION
	m_BufferSpanManager.Init( m_pCtx, m_type, 512, m_nSize, m_bDynamic );
		
	if ( m_type == kGLMIndexBuffer )
		m_bPseudo = true;
#endif
	
	if ( g_bUsePseudoBufs && m_bDynamic )
	{
		m_bPseudo = true;
	}
			
	if ( m_bPseudo )
	{
		m_nHandle = 0;		

#if GL_ENABLE_UNLOCK_BUFFER_OVERWRITE_DETECTION
		// Empty dirty range (start > end).
		m_nDirtyRangeStart = 0xFFFFFFFF;
		m_nDirtyRangeEnd = 0;

		// Room for a guard DWORD after the payload, rounded up to whole 4KB pages so the storage can be page protected.
		m_nActualSize = ALIGN_VALUE( ( m_nSize + sizeof( uint32 ) ), 4096 );
		m_pPseudoBuf = m_pActualPseudoBuf = (char *)VirtualAlloc( NULL, m_nActualSize, MEM_COMMIT, PAGE_READWRITE );
		if ( !m_pPseudoBuf )
		{
			Error( "VirtualAlloc() failed!\n" );
		}

		// Fill with a known pattern so Unlock() can detect which bytes were written.
		for ( uint i = 0; i < m_nActualSize / sizeof( uint32 ); i++ )
		{
			reinterpret_cast< uint32 * >( m_pPseudoBuf )[i] = 0xDEADBEEF;
		}

		// Read-only outside of Lock()/Unlock() pairs.
		DWORD nOldProtect;
		BOOL bResult = VirtualProtect( m_pActualPseudoBuf, m_nActualSize, PAGE_READONLY, &nOldProtect );
		if ( !bResult )
		{
			Error( "VirtualProtect() failed!\n" );
		}
#else
		// Over-allocate by 15 bytes so the usable pointer can be rounded up to a 16 byte boundary.
		m_nActualSize = size + 15;
		m_pActualPseudoBuf = (char*)malloc( m_nActualSize );
		if ( !m_pActualPseudoBuf )
		{
			// Same policy as the VirtualAlloc() path above: without backing store every Lock() would
			// hand out a bogus address.
			Error( "malloc() failed!\n" );
		}
		m_pPseudoBuf = (char*)(((intp)m_pActualPseudoBuf + 15) & ~15);
#endif
		
		m_pCtx->BindBufferToCtx( m_type, NULL );		// exit with no buffer bound
	}
	else
	{
		gGL->glGenBuffersARB( 1, &m_nHandle );

		m_pCtx->BindBufferToCtx( m_type, this );	// causes glBindBufferARB

		// buffers start out static, but if they get orphaned and gl_bufmode is non zero,
		// then they will get flipped to dynamic.
		
		GLenum hint = GL_STATIC_DRAW_ARB;
		switch (m_type)
		{
			case kGLMVertexBuffer:	hint = m_bDynamic ? GL_DYNAMIC_DRAW_ARB : GL_STATIC_DRAW_ARB; break;
			case kGLMIndexBuffer:	hint = m_bDynamic ? GL_DYNAMIC_DRAW_ARB : GL_STATIC_DRAW_ARB; break;
			case kGLMUniformBuffer:	hint = GL_DYNAMIC_DRAW_ARB; break;
			case kGLMPixelBuffer:	hint = m_bDynamic ? GL_DYNAMIC_DRAW_ARB : GL_STATIC_DRAW_ARB; break;
			
			default: Assert(!"Unknown buffer type" ); DXABSTRACT_BREAK_ON_ERROR();
		}

		gGL->glBufferDataARB( m_buffGLTarget, m_nSize, (const GLvoid*)NULL, hint );	// may ultimately need more hints to set the usage correctly (esp for streaming)

		// Force the initial map modes: synchronous map, explicit flush.
		SetModes( false, true, true );

		m_pCtx->BindBufferToCtx( m_type, NULL );	// unbind me
	}
}

// Frees the storage owned by the buffer: the GL buffer object for real buffers, or the system memory
// block for pseudo buffers. The owning context must be current.
CGLMBuffer::~CGLMBuffer( )
{
	m_pCtx->CheckCurrent();

	if ( !m_bPseudo )
	{
		// Real GL buffer object.
		gGL->glDeleteBuffersARB( 1, &m_nHandle );
	}
	else
	{
		// Pseudo buffer: release with the allocator that matches the constructor.
#if GL_ENABLE_UNLOCK_BUFFER_OVERWRITE_DETECTION
		if ( !VirtualFree( m_pActualPseudoBuf, 0, MEM_RELEASE ) )
		{
			Error( "VirtualFree() failed!\n" );
		}
#else
		free( m_pActualPseudoBuf );
#endif
		m_pPseudoBuf = NULL;
		m_pActualPseudoBuf = NULL;
	}

	// Scrub the remaining members.
	m_nHandle = 0;
	m_pCtx = NULL;
	m_pLastMappedAddress = NULL;

#if GL_ENABLE_INDEX_VERIFICATION
	m_BufferSpanManager.Deinit();
#endif
}

// Updates the buffer's async-map and explicit-flush modes. Assumes the buffer is bound; called by the
// constructor and by Lock().
//
//   bAsyncMap      - true to map without serializing against pending GL work.
//   bExplicitFlush - true if written ranges are flushed explicitly (FlushRange) instead of on unmap.
//   bForce         - apply both settings even if they match the cached state.
//
// Pseudo buffers ignore the call. GL buffer parameters are only issued on the
// GL_APPLE_flush_buffer_range path; when GL_ARB_map_buffer_range is available only the cached flags
// are updated here.
void CGLMBuffer::SetModes( bool bAsyncMap, bool bExplicitFlush, bool bForce )
{
	if ( m_bPseudo )
	{
		// ignore it...
		return;
	}

	// Note that the GL_ARB_map_buffer_range path handles both modes in the glMapBufferRange() call in Lock().
	const bool bUseAppleBufferParams = gGL->m_bHave_GL_APPLE_flush_buffer_range && !gGL->m_bHave_GL_ARB_map_buffer_range;

	if ( bForce || ( bAsyncMap != m_bEnableAsyncMap ) )
	{
		if ( bUseAppleBufferParams )
		{
			// note the sense of the parameter, it's TRUE if you *want* serialization, so for async you turn it to false.
			gGL->glBufferParameteriAPPLE( m_buffGLTarget, GL_BUFFER_SERIALIZED_MODIFY_APPLE, !bAsyncMap );
		}
		m_bEnableAsyncMap = bAsyncMap;
	}

	if ( bForce || ( bExplicitFlush != m_bEnableExplicitFlush ) )
	{
		if ( bUseAppleBufferParams )
		{
			// note the sense of the parameter, it's TRUE if you *want* auto-flush-on-unmap, so for explicit-flush, you turn it to false.
			gGL->glBufferParameteriAPPLE( m_buffGLTarget, GL_BUFFER_FLUSHING_UNMAP_APPLE, !bExplicitFlush );
		}
		m_bEnableExplicitFlush = bExplicitFlush;
	}
}

#if GL_ENABLE_INDEX_VERIFICATION
// Returns true when the span manager reports [nOffset, nOffset + nSize) as fully covered by written spans.
bool CGLMBuffer::IsSpanValid( uint nOffset, uint nSize ) const
{
	const bool bFullyCovered = m_BufferSpanManager.IsValid( nOffset, nSize );
	return bFullyCovered;
}
#endif

// Explicitly flushes 'size' bytes starting at buffer offset 'offset' of the currently mapped range.
// Assumes the buffer is bound. Nothing is flushed for static staging locks or pseudo buffers.
void CGLMBuffer::FlushRange( uint offset, uint size )
{
	if ( m_pStaticBuffer || m_bPseudo )
	{
		// nothing to do
		return;
	}

#ifdef REPORT_LOCK_TIME
	double flBegin = Plat_FloatTime();
#endif

	if ( gGL->m_bHave_GL_ARB_map_buffer_range )
	{
		// This path takes an offset relative to the start of the mapped range, which begins at m_dirtyMinOffset.
		const GLintptr nOfsInMapping = (GLintptr)( offset - m_dirtyMinOffset );
		gGL->glFlushMappedBufferRange( m_buffGLTarget, nOfsInMapping, (GLsizeiptr)size );
	}
	else if ( gGL->m_bHave_GL_APPLE_flush_buffer_range )
	{
		// This path is handed the buffer offset as is.
		gGL->glFlushMappedBufferRangeAPPLE( m_buffGLTarget, (GLintptr)offset, (GLsizeiptr)size );
	}
	// If you don't have any extension support here, you'll flush the whole buffer on unmap. Performance loss, but it's still safe and correct.

#ifdef REPORT_LOCK_TIME
	double flFinish = Plat_FloatTime();
	if ( flFinish - flBegin > 5.0 / 1000.0 )
	{
		int nElapsedMs = ( int )( ( flFinish - flBegin ) * 1000 );
		if ( nElapsedMs > 2 )
		{
			Msg( "**** " );
		}
		Msg( "glFlushMappedBufferRange Time %d: ( Name=%d BufSize=%d ) Target=%p Offset=%d FlushSize=%d\n", nElapsedMs, m_nHandle, m_nSize, m_buffGLTarget, offset - m_dirtyMinOffset, size );
	}
#endif
}

// Locks the byte range described by pParams for CPU writes and returns the address to write to.
//
//   pParams     - offset and size of the range, plus the discard / no-overwrite flags.
//   pAddressOut - receives the write address; set to NULL if the lock is rejected.
//
// The lock is rejected (DXABSTRACT_BREAK_ON_ERROR, no state marked as mapped) when the buffer is already
// locked or the range does not lie inside the buffer.
//
// The memory handed out depends on the buffer and the available GL extensions, tried in this order:
//   1. pseudo buffer             - pointer into the buffer's own system memory block
//   2. persistent mapped buffer  - dynamic buffers with GL_ARB_buffer_storage, while the context's
//                                  persistent buffer has room
//   3. pinned memory             - dynamic buffers with GL_AMD_pinned_memory (not on OSX), while the
//                                  context's pinned buffer has room
//   4. static staging buffer     - discard / no-overwrite locks no larger than GL_STATIC_BUFFER_SIZE,
//                                  unless g_bDisableStaticBuffer is set (uploaded with
//                                  glBufferSubData in Unlock)
//   5. glMapBufferRange / glMapBufferARB
// m_nRevision is bumped whenever the storage behind the buffer changes (discard / orphan, move inside
// the persistent buffer, switch between persistent and non persistent storage).
void CGLMBuffer::Lock( GLMBuffLockParams *pParams, char **pAddressOut )
{
#if GL_TELEMETRY_GPU_ZONES
	CScopedGLMPIXEvent glmPIXEvent( "CGLMBuffer::Lock" );
	g_TelemetryGPUStats.m_nTotalBufferLocksAndUnlocks++;
#endif

	char *resultPtr = NULL;

	// Every early-out below leaves the caller with a well defined (NULL) address instead of whatever the
	// variable happened to contain.
	*pAddressOut = NULL;
	
	if ( m_bMapped )
	{
		DXABSTRACT_BREAK_ON_ERROR();
		return;
	}
	
	m_pCtx->CheckCurrent();

	Assert( pParams->m_nSize );
	
	m_LockParams = *pParams;
	
	if ( pParams->m_nOffset >= m_nSize )
	{
		DXABSTRACT_BREAK_ON_ERROR();
		return;
	}
	
	// The offset is known to be inside the buffer at this point, so m_nSize - offset cannot wrap
	// (whereas offset + size could, letting an oversized lock through).
	if ( pParams->m_nSize > ( m_nSize - pParams->m_nOffset ) )
	{
		DXABSTRACT_BREAK_ON_ERROR();
		return;
	}

#if GL_ENABLE_INDEX_VERIFICATION
	if ( pParams->m_bDiscard )
	{
		m_BufferSpanManager.DiscardAllSpans();
	}
#endif

	m_pStaticBuffer = NULL;
	bool bUsingPersistentBuffer = false;

	uint padding = 0;
	if ( m_bDynamic && gGL->m_bHave_GL_ARB_buffer_storage )
	{
		// Compute padding to add to make sure the start offset is valid
		CPersistentBuffer *pTempBuffer = m_pCtx->GetCurPersistentBuffer( m_type );
		uint persistentBufferOffset = pTempBuffer->GetOffset();

		if (pParams->m_nOffset > persistentBufferOffset)
		{
			// Make sure the start offset is valid (adding padding to the persistent buffer)
			padding = pParams->m_nOffset - persistentBufferOffset;
		}
	}
	
	if ( m_bPseudo )
	{
		if ( pParams->m_bDiscard )
		{
			m_nRevision++;
		}

		// async map modes are a no-op
				
		// calc lock address
		resultPtr = m_pPseudoBuf + pParams->m_nOffset;

#if GL_ENABLE_UNLOCK_BUFFER_OVERWRITE_DETECTION
		BOOL bResult;
		DWORD nOldProtect;
		if ( pParams->m_bDiscard )
		{
			// A discard resets the whole payload to the fill pattern and empties the dirty range.
			bResult = VirtualProtect( m_pActualPseudoBuf, m_nSize, PAGE_READWRITE, &nOldProtect );
			if ( !bResult )
			{
				Error( "VirtualProtect() failed!\n" );
			}

			m_nDirtyRangeStart = 0xFFFFFFFF;
			m_nDirtyRangeEnd = 0;

			for ( uint i = 0; i < m_nSize / sizeof( uint32 ); i++ )
			{
				reinterpret_cast< uint32 * >( m_pPseudoBuf )[i] = 0xDEADBEEF;
			}

			bResult = VirtualProtect( m_pActualPseudoBuf, m_nSize, PAGE_READONLY, &nOldProtect );
			if ( !bResult )
			{
				Error( "VirtualProtect() failed!\n" );
			}
		}

		// Make only the 4KB pages touched by the locked range writable: round the start DOWN to its page
		// base (& ~4095) and the end UP to the next page boundary.
		uint nProtectOfs = m_LockParams.m_nOffset & ~4095;
		uint nProtectEnd = ( m_LockParams.m_nOffset + m_LockParams.m_nSize + 4095 ) & ~4095;
		uint nProtectSize = nProtectEnd - nProtectOfs;
		bResult = VirtualProtect( m_pActualPseudoBuf + nProtectOfs, nProtectSize, PAGE_READWRITE, &nOldProtect );
		if ( !bResult )
		{
			Error( "VirtualProtect() failed!\n" );
		}
#endif
	}
	else if ( m_bDynamic && gGL->m_bHave_GL_ARB_buffer_storage && ( m_pCtx->GetCurPersistentBuffer( m_type )->GetBytesRemaining() >= ( pParams->m_nSize + padding ) ) )
	{
		CPersistentBuffer *pTempBuffer = m_pCtx->GetCurPersistentBuffer( m_type );

		// Make sure the start offset is valid (adding padding to the persistent buffer)
		pTempBuffer->Append( padding );

		uint persistentBufferOffset = pTempBuffer->GetOffset();
		uint startOffset = persistentBufferOffset - pParams->m_nOffset;

		if ( pParams->m_bDiscard || ( startOffset != m_nPersistentBufferStartOffset ) )
		{
			m_nRevision++;
			// Offset to be added to the vertex and index buffer when setting the vertex and index buffer (before drawing)
			// Since we are using a immutable buffer storage, the persistent buffer is actually bigger than
			// buffer size requested upon creation. We keep appending to the end of the persistent buffer 
			// and therefore need to keep track of the start of the actual buffer (in the persistent one)
			m_nPersistentBufferStartOffset = startOffset;

			//DevMsg( "Discard (%s): startOffset = %d\n", pParams->m_bDiscard ? "true" : "false", m_nPersistentBufferStartOffset );
		}

		resultPtr = static_cast<char*>(pTempBuffer->GetPtr()) + persistentBufferOffset;
		bUsingPersistentBuffer = true;

		//DevMsg( " --> buff=%x, startOffset=%d, paramsOffset=%d, persistOffset = %d\n", this, m_nPersistentBufferStartOffset, pParams->m_nOffset, persistentBufferOffset );
	}
#ifndef OSX
	else if ( m_bDynamic && gGL->m_bHave_GL_AMD_pinned_memory && ( m_pCtx->GetCurPinnedMemoryBuffer()->GetBytesRemaining() >= pParams->m_nSize ) )
	{
		if ( pParams->m_bDiscard )
		{
			m_nRevision++;
		}

		m_dirtyMinOffset = pParams->m_nOffset;
		m_dirtyMaxOffset = pParams->m_nOffset + pParams->m_nSize;

		CPinnedMemoryBuffer *pTempBuffer = m_pCtx->GetCurPinnedMemoryBuffer();

		// Remember where in the pinned buffer this lock lives; Unlock() copies from there into the GL buffer.
		m_nPinnedMemoryOfs = pTempBuffer->GetOfs();

		resultPtr = static_cast<char*>( pTempBuffer->GetPtr() ) + m_nPinnedMemoryOfs;
		
		pTempBuffer->Append( pParams->m_nSize );
	}
#endif // OSX
	else if ( !g_bDisableStaticBuffer && ( pParams->m_bDiscard || pParams->m_bNoOverwrite ) && ( pParams->m_nSize <= GL_STATIC_BUFFER_SIZE ) )
	{
#if TOGL_SUPPORT_NULL_DEVICE
		if ( !g_bNullD3DDevice )
#endif
		{
			if ( pParams->m_bDiscard )
			{
				m_pCtx->BindBufferToCtx( m_type, this );

				// observe gl_bufmode on any orphan event.
				// if orphaned and bufmode is nonzero, flip it to dynamic.
				GLenum hint = gl_bufmode.GetInt() ? GL_DYNAMIC_DRAW_ARB : GL_STATIC_DRAW_ARB;
				gGL->glBufferDataARB( m_buffGLTarget, m_nSize, (const GLvoid*)NULL, hint );
			
				m_nRevision++; // revision grows on orphan event
			}
		}

		m_dirtyMinOffset = pParams->m_nOffset;
		m_dirtyMaxOffset = pParams->m_nOffset + pParams->m_nSize;

		// The caller writes into a shared staging block; Unlock() uploads it with glBufferSubData.
		switch ( m_type )
		{
			case kGLMVertexBuffer:
			{
				m_pStaticBuffer = m_StaticBuffers[ 0 ];
				break;
			}
			case kGLMIndexBuffer:
			{
				m_pStaticBuffer = m_StaticBuffers[ 1 ];
				break;
			}
			default:
			{
				DXABSTRACT_BREAK_ON_ERROR();
				return;
			}
		}

		resultPtr = m_pStaticBuffer;
	}
	else
	{
		// bind (yes, even for pseudo - this binds name 0)
		m_pCtx->BindBufferToCtx( m_type, this );

		// perform discard if requested
		if ( pParams->m_bDiscard )
		{
			// observe gl_bufmode on any orphan event.
			// if orphaned and bufmode is nonzero, flip it to dynamic.
			
			// We always want to call glBufferData( ..., NULL ) on discards, even though we're using the GL_MAP_INVALIDATE_BUFFER_BIT flag, because this flag is actually only a hint according to AMD.
			GLenum hint = gl_bufmode.GetInt() ? GL_DYNAMIC_DRAW_ARB : GL_STATIC_DRAW_ARB;
			gGL->glBufferDataARB( m_buffGLTarget, m_nSize, (const GLvoid*)NULL, hint );
									
			m_nRevision++;	// revision grows on orphan event
		}

		// adjust async map option appropriately, leave explicit flush unchanged
		SetModes( pParams->m_bNoOverwrite, m_bEnableExplicitFlush );

		// map
		char *mapPtr;
		if ( gGL->m_bHave_GL_ARB_map_buffer_range )
		{
			// m_bEnableAsyncMap is actually pParams->m_bNoOverwrite
			GLbitfield parms = GL_MAP_WRITE_BIT | ( m_bEnableAsyncMap ? GL_MAP_UNSYNCHRONIZED_BIT : 0 ) | ( pParams->m_bDiscard ? GL_MAP_INVALIDATE_BUFFER_BIT : 0 ) | ( m_bEnableExplicitFlush ? GL_MAP_FLUSH_EXPLICIT_BIT : 0 );

#ifdef REPORT_LOCK_TIME
			double flStart = Plat_FloatTime();
#endif

			mapPtr = (char*)gGL->glMapBufferRange( m_buffGLTarget, pParams->m_nOffset, pParams->m_nSize, parms);

#ifdef REPORT_LOCK_TIME
			double flEnd = Plat_FloatTime();
			if ( flEnd - flStart > 5.0 / 1000.0 )
			{
				int nDelta = ( int )( ( flEnd - flStart ) * 1000 );
				if ( nDelta > 2 )
				{
					Msg( "**** " );
				}
				Msg( "glMapBufferRange Time=%d: ( Name=%d BufSize=%d ) Target=%p Offset=%d LockSize=%d ", nDelta, m_nHandle, m_nSize, m_buffGLTarget, pParams->m_nOffset, pParams->m_nSize );
				if ( parms & GL_MAP_WRITE_BIT )
				{
					Msg( "GL_MAP_WRITE_BIT ");
				}
				if ( parms & GL_MAP_UNSYNCHRONIZED_BIT )
				{
					Msg( "GL_MAP_UNSYNCHRONIZED_BIT ");
				}
				if ( parms & GL_MAP_INVALIDATE_BUFFER_BIT )
				{
					Msg( "GL_MAP_INVALIDATE_BUFFER_BIT ");
				}
				if ( parms & GL_MAP_INVALIDATE_RANGE_BIT )
				{
					Msg( "GL_MAP_INVALIDATE_RANGE_BIT ");
				}
				if ( parms & GL_MAP_FLUSH_EXPLICIT_BIT )
				{
					Msg( "GL_MAP_FLUSH_EXPLICIT_BIT ");
				}
				Msg( "\n" );
			}
#endif
		}
		else
		{
			mapPtr = (char*)gGL->glMapBufferARB( m_buffGLTarget, GL_WRITE_ONLY_ARB );
		}

		Assert( mapPtr );
				
		// calculate offset location: glMapBufferRange() already returns the address of the requested
		// offset, glMapBufferARB() returns the start of the buffer.
		resultPtr = mapPtr;
		if ( !gGL->m_bHave_GL_ARB_map_buffer_range )
		{
			resultPtr += pParams->m_nOffset;
		}

		// set range
		m_dirtyMinOffset = pParams->m_nOffset;
		m_dirtyMaxOffset = pParams->m_nOffset + pParams->m_nSize;
	}

	if ( m_bUsingPersistentBuffer != bUsingPersistentBuffer )
	{
		// Up the revision number when switching from a persistent to a non persistent buffer (or vice versa)
		// Ensure the right GL buffer is bound before drawing (and vertex attribs properly set)
		m_nRevision++;
		m_bUsingPersistentBuffer = bUsingPersistentBuffer;
	}

	m_bMapped = true;

	m_pLastMappedAddress = (float*)resultPtr;
	
	*pAddressOut = resultPtr;
}

// Ends the lock started by Lock() and makes the written bytes available to GL.
//
//   nActualSize - number of bytes actually written, counted from the start of the locked range;
//                 a negative value means "the whole locked range".
//   pActualData - optional source for the written bytes. When non-NULL it is uploaded / copied instead
//                 of relying on the caller having written through the address returned by Lock()
//                 (static staging, pseudo and mapped paths only).
//
// The call is rejected (DXABSTRACT_BREAK_ON_ERROR, buffer stays locked) when the buffer is not locked or
// nActualSize exceeds the locked size. What happens otherwise mirrors the path Lock() took: pinned
// memory is copied into the GL buffer, the persistent buffer's write offset is advanced, the static
// staging block is uploaded with glBufferSubData, a pseudo buffer needs no GL work, and a mapped buffer
// is flushed and unmapped.
void CGLMBuffer::Unlock( int nActualSize, const void *pActualData )
{
#if GL_TELEMETRY_GPU_ZONES
	CScopedGLMPIXEvent glmPIXEvent( "CGLMBuffer::Unlock" );
	g_TelemetryGPUStats.m_nTotalBufferLocksAndUnlocks++;
#endif

	m_pCtx->CheckCurrent();
	
	if ( !m_bMapped )
	{
		DXABSTRACT_BREAK_ON_ERROR();
		return;
	}

	// Negative size == caller wrote the entire locked range.
	if ( nActualSize < 0 )
	{
		nActualSize = m_LockParams.m_nSize;
	}

	if ( nActualSize > (int)m_LockParams.m_nSize )
	{
		DXABSTRACT_BREAK_ON_ERROR();
		return;
	}

#if GL_ENABLE_UNLOCK_BUFFER_OVERWRITE_DETECTION
	if ( m_bPseudo )
	{
		// Check guard DWORD to detect buffer overruns (but are still within the last 4KB page so they don't get caught via pagefaults)
		if ( *reinterpret_cast< const uint32 * >( m_pPseudoBuf + m_nSize ) != 0xDEADBEEF )
		{
			// If this fires the client app has overwritten the guard DWORD beyond the end of the buffer.
			DXABSTRACT_BREAK_ON_ERROR();
		}

		// The bytes of the 0xDEADBEEF fill pattern, indexed by (buffer offset & 3).
		static const uint s_nInitialValues[4] = { 0xEF, 0xBE, 0xAD, 0xDE };

		// Find the first and last byte of the locked range that no longer hold the fill pattern.
		int nActualModifiedStart, nActualModifiedEnd;
		for ( nActualModifiedStart = 0; nActualModifiedStart < (int)m_LockParams.m_nSize; ++nActualModifiedStart )
			if ( reinterpret_cast< const uint8 * >( m_pLastMappedAddress )[nActualModifiedStart] != s_nInitialValues[ ( m_LockParams.m_nOffset + nActualModifiedStart ) & 3 ] )
				break;

		for ( nActualModifiedEnd = m_LockParams.m_nSize - 1; nActualModifiedEnd > nActualModifiedStart; --nActualModifiedEnd )
			if ( reinterpret_cast< const uint8 * >( m_pLastMappedAddress )[nActualModifiedEnd] != s_nInitialValues[ ( m_LockParams.m_nOffset + nActualModifiedEnd ) & 3 ] )
				break;

		int nNumActualBytesModified = 0;

		if ( nActualModifiedEnd >= nActualModifiedStart )
		{
			// The modified check is conservative (i.e. it should always err on the side of detecting <= actual bytes than where actually modified, never more).
			// We primarily care about the case where the user lies about the actual # of modified bytes, which can lead to difficult to debug/inconsistent problems with some drivers.
			// Round up/down the modified range, because the user's data may alias with the initial buffer values (0xDEADBEEF) so we may miss some bytes that where written.
			if ( m_type == kGLMIndexBuffer )
			{
				// Index buffers: round to 2 byte granularity.
				nActualModifiedStart &= ~1;
				nActualModifiedEnd = MIN( (int)m_LockParams.m_nSize, ( ( nActualModifiedEnd + 1 ) + 1 ) & ~1 ) - 1;
			}
			else
			{
				// Everything else: round to 4 byte granularity.
				nActualModifiedStart &= ~3;
				nActualModifiedEnd = MIN( (int)m_LockParams.m_nSize, ( ( nActualModifiedEnd + 1 ) + 3 ) & ~3 ) - 1;
			}
		
			nNumActualBytesModified = nActualModifiedEnd + 1;

			if ( nActualSize < nNumActualBytesModified )
			{
				// The caller may be lying about the # of actually modified bytes in this lock.
				// Has this lock region been previously locked? If so, it may have been previously overwritten before. Otherwise, the region had to be the 0xDEADBEEF fill DWORD at lock time.
				if ( ( m_nDirtyRangeStart > m_nDirtyRangeEnd ) ||
				     ( m_LockParams.m_nOffset > m_nDirtyRangeEnd ) || ( ( m_LockParams.m_nOffset + m_LockParams.m_nSize ) <= m_nDirtyRangeStart )  )
				{
					// If this fires the client has lied about the actual # of bytes they've modified in the buffer - this will cause unreliable rendering on AMD drivers (because AMD actually pays attention to the actual # of flushed bytes).
					DXABSTRACT_BREAK_ON_ERROR();
				}
			}
		
			// Grow the buffer-wide dirty range to include what was modified by this lock.
			m_nDirtyRangeStart = MIN( m_nDirtyRangeStart, m_LockParams.m_nOffset + nActualModifiedStart );
			m_nDirtyRangeEnd = MAX( m_nDirtyRangeEnd, m_LockParams.m_nOffset + nActualModifiedEnd );
		}

#if GL_ENABLE_INDEX_VERIFICATION
		if ( nActualModifiedEnd >= nActualModifiedStart )
		{
			int n = nActualModifiedEnd + 1;
			if ( n != nActualSize )
			{
				// The actual detected modified size is < than the reported size, which is common because the last few DWORD's of the vertex format may not actually be used/written (or read by the vertex shader). So just fudge it so the batch consumption checks work.
				if ( ( (int)nActualSize - n ) <= 32 )
				{
					n = nActualSize;
				}
			}

			m_BufferSpanManager.AddSpan( m_LockParams.m_nOffset + nActualModifiedStart, m_LockParams.m_nSize, n - nActualModifiedStart, m_LockParams.m_bDiscard, m_LockParams.m_bNoOverwrite );
		}
#endif		
	}
#elif GL_ENABLE_INDEX_VERIFICATION
	// Without overwrite detection the reported size is trusted when recording the written span.
	if ( nActualSize > 0 )
	{
		m_BufferSpanManager.AddSpan( m_LockParams.m_nOffset, m_LockParams.m_nSize, nActualSize, m_LockParams.m_bDiscard, m_LockParams.m_bNoOverwrite );
	}
#endif

#if GL_BATCH_PERF_ANALYSIS
	if ( m_type == kGLMIndexBuffer )
		g_nTotalIBLockBytes += nActualSize;
	else if ( m_type == kGLMVertexBuffer )
		g_nTotalVBLockBytes += nActualSize;
#endif

#ifndef OSX
	// m_nPinnedMemoryOfs >= 0 means Lock() handed out pinned memory.
	if ( m_nPinnedMemoryOfs >= 0 )
	{
#if TOGL_SUPPORT_NULL_DEVICE
		if ( !g_bNullD3DDevice )
		{
#endif
		if ( nActualSize )
		{
			m_pCtx->BindBufferToCtx( m_type, this );

			// Copy the written bytes from the pinned memory buffer into this buffer at the locked offset.
			gGL->glCopyBufferSubData( 
				GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD,
				m_buffGLTarget,
				m_nPinnedMemoryOfs,
				m_dirtyMinOffset,
				nActualSize );
		}

#if TOGL_SUPPORT_NULL_DEVICE
		}
#endif
		
		m_nPinnedMemoryOfs = -1;
	}
	else
#endif // OSX
	if ( m_bUsingPersistentBuffer )
	{
		// The data was written straight into the persistently mapped buffer; just advance its write offset.
		if ( nActualSize )
		{
			CPersistentBuffer *pTempBuffer = m_pCtx->GetCurPersistentBuffer( m_type );
			pTempBuffer->Append( nActualSize );

			//DevMsg( "   <-- actualSize=%d, persistOffset = %d\n", nActualSize, pTempBuffer->GetOffset() );
		}
	}
    else if ( m_pStaticBuffer )
	{
#if TOGL_SUPPORT_NULL_DEVICE
		if ( !g_bNullD3DDevice )
#endif
		{
			if ( nActualSize )
			{
				tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "UnlockSubData" );

	#ifdef REPORT_LOCK_TIME
				double flStart = Plat_FloatTime();
	#endif
				m_pCtx->BindBufferToCtx( m_type, this );
				
				Assert( nActualSize <= (int)( m_dirtyMaxOffset - m_dirtyMinOffset ) );

				// Upload from the caller supplied data if given, otherwise from the staging block Lock() handed out.
				glBufferSubDataMaxSize( m_buffGLTarget, m_dirtyMinOffset, nActualSize, pActualData ? pActualData : m_pStaticBuffer );
						
		#ifdef REPORT_LOCK_TIME
				double flEnd = Plat_FloatTime();
				if ( flEnd - flStart > 5.0 / 1000.0 )
				{
					int nDelta = ( int )( ( flEnd - flStart ) * 1000 );
					if ( nDelta > 2 )
					{
						Msg( "**** " );
					}
					// Msg( "glBufferSubData Time=%d: ( Name=%d BufSize=%d ) Target=%p Offset=%d Size=%d\n", nDelta, m_nHandle, m_nSize, m_buffGLTarget, m_dirtyMinOffset, m_dirtyMaxOffset - m_dirtyMinOffset );
				}
	#endif		
			}
		}

		m_pStaticBuffer = NULL;
	}
	else if ( m_bPseudo )
	{
		if ( pActualData )
		{
			memcpy( m_pLastMappedAddress, pActualData, nActualSize );
		}

#if GL_ENABLE_UNLOCK_BUFFER_OVERWRITE_DETECTION
		// Make the pages that were unprotected for this lock read-only again.
		// NOTE(review): '& 4095' yields the offset within its page rather than the page base ('& ~4095');
		// the resulting range still starts at or before the locked range, so it covers it, but it can
		// span more pages than necessary. Any change here must stay in step with the matching code in Lock().
		uint nProtectOfs = m_LockParams.m_nOffset & 4095;
		uint nProtectEnd = ( m_LockParams.m_nOffset + m_LockParams.m_nSize + 4095 ) & ~4095;
		uint nProtectSize = nProtectEnd - nProtectOfs;

		DWORD nOldProtect;
		BOOL bResult = VirtualProtect( m_pActualPseudoBuf + nProtectOfs, nProtectSize, PAGE_READONLY, &nOldProtect );
		if ( !bResult )
		{
			Error( "VirtualProtect() failed!\n" );
		}
#endif
	}
	else
	{
		tmZone( TELEMETRY_LEVEL2, TMZF_NONE, "UnlockUnmap" );

		// Copy caller supplied data into the mapping before it is flushed and unmapped.
		if ( pActualData )
		{
			memcpy( m_pLastMappedAddress, pActualData, nActualSize );
		}

		m_pCtx->BindBufferToCtx( m_type, this );

		Assert( nActualSize <= (int)( m_dirtyMaxOffset - m_dirtyMinOffset ) );

		// time to do explicit flush (currently m_bEnableExplicitFlush is always true)
		if ( m_bEnableExplicitFlush )
		{
			FlushRange( m_dirtyMinOffset, nActualSize );
		}
		
		// clear dirty range no matter what
		m_dirtyMinOffset = m_dirtyMaxOffset = 0;								// adjust/grow on lock, clear on unlock

#ifdef REPORT_LOCK_TIME
		double flStart = Plat_FloatTime();
#endif

		gGL->glUnmapBuffer( m_buffGLTarget );

#ifdef REPORT_LOCK_TIME
		double flEnd = Plat_FloatTime();
		if ( flEnd - flStart > 5.0 / 1000.0 )
		{
			int nDelta = ( int )( ( flEnd - flStart ) * 1000 );
			if ( nDelta > 2 )
			{
				Msg( "**** " );
			}
			Msg( "glUnmapBuffer Time=%d: ( Name=%d BufSize=%d ) Target=%p\n", nDelta, m_nHandle, m_nSize, m_buffGLTarget );
		}
#endif		
	}

	m_bMapped = false;
}

// Returns the GL buffer name that currently backs this buffer: the context's persistent buffer while
// the last lock used persistent storage, otherwise the buffer's own handle.
GLuint CGLMBuffer::GetHandle() const
{
	if ( m_bUsingPersistentBuffer )
	{
		return m_pCtx->GetCurPersistentBuffer( m_type )->GetHandle();
	}

	return m_nHandle;
}