//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================

//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------

#include "SpuMgr_spu.h"
#include <cell/atomic.h>

#ifndef _CERT
#include <libsn_spu.h>
#endif

#include <stdlib.h>
#include <string.h>

//--------------------------------------------------------------------------------------------------
// Globals
//--------------------------------------------------------------------------------------------------

// singleton instance of the SPU manager (128B aligned)
SpuMgr gSpuMgr __attribute__((aligned(128)));
// 16B carry buffer used by SPU_memcpy: seeded from the request header, then
// refreshed after every chunk with the bytes of the last (possibly partial)
// 16B line that was written, so they can be re-emitted ahead of the next chunk
unsigned char gUnalignedMem[16] __attribute__((aligned(16)));
// local-store copy of the current copy request; SPU_memcpy DMAs it in from the
// address it receives through the mailbox
MemCpyHeader gMemCpyHeader __attribute__((aligned(16)));

//--------------------------------------------------------------------------------------------------
// SPU_memcpy
//
//	Services one copy request. The address of a MemCpyHeader is read from the
//	mailbox, the header is DMA'd into gMemCpyHeader, and gMemCpyHeader.size bytes
//	are then moved from gMemCpyHeader.src to gMemCpyHeader.dst in chunks of at
//	most 8192 bytes. src must be 16B aligned; dst may sit at any offset inside a
//	16B line, so every chunk is re-based in local store before being written out.
//
//	pBuf1 - local-store staging buffer that receives each chunk DMA'd from src
//	pBuf2 - local-store staging buffer in which the chunk is shifted by dst's
//	        offset within its 16B line and from which it is DMA'd to dst
//
//	NOTE(review): both buffers are handed straight to the DMA routines, so they
//	are presumably 16B aligned, with pBuf1 >= 8192 bytes and pBuf2 >= 8192 + 16
//	bytes (the put can cover ALIGN_UP(8192 + offset, 16)) - confirm with caller.
//--------------------------------------------------------------------------------------------------

void SPU_memcpy( void *pBuf1, void *pBuf2 )
{	
	uint32_t header;

	// the mailbox value is used below as the effective address of the request header
	gSpuMgr.ReadMailbox( &header );

	// held until MemcpyUnlock() at the end of this function
	gSpuMgr.MemcpyLock();

	// fetch the request header on tag 0, then wait on tag mask 0x1 (1 << 0)
	gSpuMgr.DmaGetUNSAFE( &gMemCpyHeader, header, sizeof( MemCpyHeader ), 0 );
	gSpuMgr.DmaDone( 0x1 );

	// only the destination may be misaligned; the source must be 16B aligned
	DEBUG_ERROR( ( gMemCpyHeader.src & 0xf ) == 0 );

	uint32_t sizeAligned;
	uint32_t sizeAlignedDown;
	uint32_t dstAlignedDown;
	uint32_t offset;

	// seed the carry buffer with the 16 bytes supplied in the header
	// NOTE(review): presumably the current contents of dst's first 16B line, so
	// the bytes that precede dst in that line survive the aligned put - confirm
	// against the code that fills in MemCpyHeader
	memcpy( gUnalignedMem, gMemCpyHeader.cacheLine, 16 );

	// full 8192-byte chunks; the remainder (<= 8192 bytes) is handled after the loop
	while ( gMemCpyHeader.size > 8192 )
	{
		sizeAligned		= 8192;
		dstAlignedDown	= SPUMGR_ALIGN_DOWN( gMemCpyHeader.dst, 16 );
		offset			= gMemCpyHeader.dst - dstAlignedDown;	// bytes in dst's 16B line that precede dst

		// pull the next chunk of source data into pBuf1
		gSpuMgr.DmaGetUNSAFE( pBuf1, gMemCpyHeader.src, sizeAligned, 0 );
		gSpuMgr.DmaDone( 0x1 );

		// rebuild dst's first 16B line: carried bytes first...
		if ( offset )
		{
			memcpy( pBuf2, gUnalignedMem, offset );
		}

		// ...followed by the chunk itself, shifted by 'offset'
		memcpy( (void *) ( (uint32_t) pBuf2 + offset ), pBuf1, sizeAligned );

		// write whole 16B lines starting at the aligned-down destination; the put
		// is rounded up, so it can extend past the end of this chunk's data
		gSpuMgr.DmaSync();
		gSpuMgr.DmaPut( dstAlignedDown, pBuf2, SPUMGR_ALIGN_UP( sizeAligned + offset, 16 ), 0 );
		gSpuMgr.DmaDone( 0x1 );

		// keep the last 16B line of what was staged; its leading bytes become the
		// carried prefix for the next chunk
		sizeAlignedDown = SPUMGR_ALIGN_DOWN( sizeAligned + offset, 16 );
		memcpy( gUnalignedMem, (void *) ( (uint32_t) pBuf2 + sizeAlignedDown ), 16 );

		gMemCpyHeader.size -= sizeAligned;

		gMemCpyHeader.dst += 8192;
		gMemCpyHeader.src += 8192;
	}

	// final chunk: the get is rounded up to a multiple of 16B to satisfy the DMA size rules
	sizeAligned		= SPUMGR_ALIGN_UP( gMemCpyHeader.size, 16 );
	dstAlignedDown	= SPUMGR_ALIGN_DOWN( gMemCpyHeader.dst, 16 );
	offset			= gMemCpyHeader.dst - dstAlignedDown;

	gSpuMgr.DmaGetUNSAFE( pBuf1, gMemCpyHeader.src, sizeAligned, 0 );
	gSpuMgr.DmaDone( 0x1 );

	if ( offset )
	{
		memcpy( pBuf2, gUnalignedMem, offset );
	}

	// only the bytes actually requested are staged
	memcpy( (void *) ( (uint32_t) pBuf2 + offset ), pBuf1, gMemCpyHeader.size );

	// NOTE(review): the put below is rounded up to a 16B multiple, so any bytes
	// between dst + size and the next 16B boundary are written with whatever
	// pBuf2 currently holds there - confirm the requester tolerates this
	sizeAligned = SPUMGR_ALIGN_UP( gMemCpyHeader.size + offset, 16 );

	gSpuMgr.DmaSync();
	gSpuMgr.DmaPut( dstAlignedDown, pBuf2, sizeAligned, 0 );
	gSpuMgr.DmaDone( 0x1 );

	// a blocking request gets a 0 written back through the mailbox once the copy is done
	if ( gMemCpyHeader.blocking )
	{
		gSpuMgr.WriteMailbox( 0 );
	}

	gSpuMgr.MemcpyUnlock();
}


//--------------------------------------------------------------------------------------------------
// DmaCheckAlignment
//   
//   	Checks restrictions specified in SpuMgr::DmaGet
//--------------------------------------------------------------------------------------------------

int DmaCheckAlignment(uint32_t src, uint32_t dest, uint32_t size)
{
#if !defined( _CERT )

	// Returns 1 when (src, dest, size) describes a legal single DMA, 0 otherwise.
	// In _CERT builds the check is compiled out and always reports success.
	uint32_t requiredAlign;

	const bool isMultipleOf16 = (size >= 16) && ((size & 0xf) == 0);
	if (isMultipleOf16)
	{
		// bulk transfer: both addresses only need 16B alignment
		requiredAlign = 16;
	}
	else
	{
		switch (size)
		{
		case 1:
		case 2:
		case 4:
		case 8:
			// small transfer: low 4 address bits must agree on both sides...
			if ((src & 0xF) != (dest & 0xF))
			{
				return 0;
			}
			// ...and both addresses must be naturally aligned to the size
			requiredAlign = size;
			break;

		default:
			// not 1, 2, 4, 8 or a multiple of 16: illegal size
			return 0;
		}
	}

	// null addresses are never acceptable
	if (!src || !dest)
	{
		return 0;
	}

	return (SPUMGR_IS_ALIGNED(src, requiredAlign) &&
			SPUMGR_IS_ALIGNED(dest, requiredAlign));

#else // _CERT
	return 1;
#endif // !_CERT
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::Init
//
//	Restarts the decrementer, clears the transfer statistics and the malloc
//	count, then reads the two lock addresses from the mailbox. Always returns 0.
//--------------------------------------------------------------------------------------------------

int SpuMgr::Init()
{
	// The decrementer is not guaranteed to be running when we get here,
	// so go through the stop / reload / re-enable sequence explicitly.
	const unsigned int kDecrementerEventBit = 0x20;

	// 1) mask off the decrementer event while we touch it
	unsigned int savedEventMask = spu_readch(SPU_RdEventStatMask);
	spu_writech(SPU_WrEventMask, savedEventMask & ~kDecrementerEventBit);

	// 2) ack anything pending for it, which also stops the decrementer
	spu_writech(SPU_WrEventAck, kDecrementerEventBit);

	// 3) writing a value back to the decrementer starts it counting again
	unsigned int currentCount = spu_readch(SPU_RdDec);
	spu_writech(SPU_WrDec, currentCount);

	// 4) restore the event mask, now with the decrementer event enabled
	spu_writech(SPU_WrEventMask, savedEventMask | kDecrementerEventBit);

	// start the statistics from a clean slate
	ResetBytesTransferred();
	m_mallocCount = 0;

	// the effective addresses of the SPU locks arrive as two mailbox values,
	// general lock first, memcpy lock second
	ReadMailbox( &m_lockEA );
	ReadMailbox( &m_memcpyLockEA );

	return 0;
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::Term
//
//	Currently a no-op: nothing is released or reset here.
//--------------------------------------------------------------------------------------------------

void SpuMgr::Term()
{
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaGet
//
// DmaGetSAFE   - alignment and size checking
// DmaGetUNSAFE - no alignment or size checking (but will assert in debug)
// _DmaGet  - handles badly aligned dma's, should be a private member really (doesn't handle small dma's)
//
// DMA restrictions
//		An MFC supports naturally aligned DMA transfer sizes of 1, 2, 4, 
//		8, and 16 bytes and multiples of 16 bytes
//		Furthermore, if size is 1, 2, 4, or 8 bytes then lower 4 bits
//		of LS and EA must match
//
//	Note:
//		Peak performance is achieved for transfers in which both the EA and 
//		the LSA are 128-byte aligned and the size of the transfer is a multiple 
//		of 128 bytes.
//--------------------------------------------------------------------------------------------------


// Issues a get of 'size' bytes from effective address 'ea' into local store
// at 'ls' on DMA tag 'tagId'. No alignment fix-up is attempted: the arguments
// must already satisfy the DMA restrictions listed above (asserted in
// non-_CERT builds). The transfer is only queued here; this function does not
// wait for it to complete.
void SpuMgr::DmaGetUNSAFE(void *ls, uint32_t ea, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( ea < 0xd0000000 );
	DEBUG_ERROR( ea );
	DEBUG_ERROR(DmaCheckAlignment((uint32_t)ls, ea, size));

	// add up bytes transferred - this must happen before the loop below,
	// which counts 'size' down to zero (accounting afterwards always added 0)
	m_bytesRequested   += size;
	m_bytesTransferred += size;

	// do the dma, split into commands of at most 0x4000 bytes (16KB) each
	while (size)
	{
		uint32_t dmaSize = 0x4000;
		dmaSize = (size < dmaSize)? size: dmaSize;
		size -= dmaSize;

		// kick off dma
		spu_mfcdma64( (void*)ls, 0, ea, dmaSize, tagId, MFC_GET_CMD);
		m_numDMATransfers++;

		ls = (void*)((uint32_t)ls + dmaSize);
		ea += dmaSize;
	}
}


//--------------------------------------------------------------------------------------------------
// SpuMgr::_DmaGet
//
//	Internal function - do not call this directly
//--------------------------------------------------------------------------------------------------

void SpuMgr::_DmaGet(void *ls, uint32_t ea, uint32_t size, uint32_t tagId)
{
	// Widen the request so it starts on a 16B boundary in main memory and
	// covers a whole number of 16B lines.
	const uint32_t headBytes  = ea & 0xF;					// bytes between aligned-down ea and ea
	const uint32_t paddedSize = size + headBytes;			// request measured from the aligned-down ea
	const uint32_t totalBytes = (paddedSize + 0xF) & ~0xF;	// rounded up to a multiple of 16B

	// A bounce buffer is required whenever the request had to be widened,
	// or the destination in local store is itself not 16B aligned.
	const bool needsBounce = (headBytes != 0) ||
							 ((paddedSize & 0xF) != 0) ||
							 (((uint32_t)ls & 0xF) != 0);

	char *pBounce = NULL;
	uint32_t lsCursor = (uint32_t)ls;
	if (needsBounce)
	{
		pBounce = (char*)MemAlign(0x10, totalBytes);
		lsCursor = (uint32_t)pBounce;
	}

	// statistics: what the caller asked for vs. what actually goes over the bus
	m_bytesRequested += size;
	m_bytesTransferred += totalBytes;

	// issue the transfer in pieces of at most 0x4000 bytes
	uint32_t eaCursor = ea - headBytes;
	for (uint32_t remaining = totalBytes; remaining != 0; )
	{
		const uint32_t chunk = (remaining > 0x4000) ? 0x4000 : remaining;

		spu_mfcdma64( (void*)lsCursor, 0, eaCursor, chunk, tagId, MFC_GET_CMD);
		m_numDMATransfers++;

		remaining -= chunk;
		lsCursor += chunk;
		eaCursor += chunk;
	}

	if (!needsBounce)
	{
		// data lands directly in the caller's buffer; caller waits on the tag
		return;
	}

	// The payload has to be copied out of the bounce buffer, so this path
	// blocks until the tag group has completed.
	DmaDone(1 << tagId);

	// skip the leading pad and hand back exactly 'size' bytes
	memcpy(ls, pBounce + headBytes, size);

	Free(pBounce);
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaGetSAFE
//
//	DMA restrictions (look at SpuMgr::DmaGetUNSAFE in this file) are 
//	handled transparently by this function
//--------------------------------------------------------------------------------------------------

// Gets 'size' bytes from effective address 'ea' into local store at 'ls' on
// DMA tag 'tagId', for any size/alignment combination. Requests that already
// satisfy the DMA restrictions go straight to DmaGetUNSAFE; everything else is
// routed through _DmaGet, which handles the misalignment itself.
//
// Note that the two paths differ in blocking behaviour: DmaGetUNSAFE only
// queues the transfer, whereas _DmaGet waits for completion whenever it has to
// copy through a temporary buffer. Callers still need to wait on 'tagId'.
void SpuMgr::DmaGetSAFE(void *ls, uint32_t ea, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( ea );

	const uint32_t lsLow = (uint32_t)ls & 0xF;
	const uint32_t eaLow = ea & 0xF;

	bool canDmaDirectly;

	if( size < 0x10 )
	{
		// if < 16B can only get 1, 2, 4 or 8B
		const bool legalSmallSize = ( size==0x1 || size==0x2 || size==0x4 || size==0x8 );

		// Lowest 4 bits of ls and ea have to match, and the addresses must be
		// naturally aligned to the transfer size. (The original test was written
		// as `ls&0xF == ea&0xF`, which parses as `ls & (0xF == ea) & 0xF` because
		// == binds tighter than &, so it practically never selected this path.)
		// Since the low nibbles are equal, checking ea's alignment covers ls too.
		canDmaDirectly = legalSmallSize &&
						 ( lsLow == eaLow ) &&
						 ( ( eaLow & ( size - 1 ) ) == 0 );
	}
	else
	{
		// has to be a multiple of 16B, and ea and ls have to be 16B aligned
		canDmaDirectly = ( ( size & 0xF ) == 0 ) &&
						 ( lsLow == 0 ) &&
						 ( eaLow == 0 );
	}

	if( canDmaDirectly )
	{
		// alignment is okay just dma
		DmaGetUNSAFE(ls,ea,size,tagId);
	}
	else
	{
		// bad size and/or alignment, let _DmaGet sort it out
		_DmaGet(ls,ea,size,tagId);
	}
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaPut
//--------------------------------------------------------------------------------------------------

// Queues a put of 'size' bytes from local store at 'ls' to effective address
// 'ea' on DMA tag 'tagId'. Arguments must already satisfy the DMA size and
// alignment restrictions (asserted); completion is not waited for here.
void SpuMgr::DmaPut(uint32_t ea, void *ls, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( (ea!=0) && (ea<0xd0000000) );	// valid ea
	DEBUG_ERROR( (uint32_t)ls < 0x40000 );		// valid ls
	DEBUG_ERROR(DmaCheckAlignment((uint32_t)ls, ea, size));

	// largest piece issued as a single command
	const uint32_t kMaxChunk = 0x4000;

	uint32_t lsCursor = (uint32_t)ls;
	for (uint32_t remaining = size; remaining != 0; )
	{
		const uint32_t chunk = (remaining > kMaxChunk) ? kMaxChunk : remaining;

		// local store -> main memory
		spu_mfcdma64( (void*)lsCursor, 0, ea, chunk, tagId, MFC_PUT_CMD);

		remaining -= chunk;
		lsCursor += chunk;
		ea += chunk;
	}
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaSmallPut
//--------------------------------------------------------------------------------------------------

// Queues a put of 'size' bytes from local store at 'ls' to effective address
// 'ea' on DMA tag 'tagId', issued as a series of equal-sized small transfers
// of 8, 4, 2 or 1 bytes each. Completion is not waited for here.
void SpuMgr::DmaSmallPut(uint32_t ea, void *ls, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( (ea!=0) && (ea<0xd0000000) );	// valid ea
	DEBUG_ERROR( (uint32_t)ls < 0x40000 );		// valid ls
	DEBUG_ERROR(DmaCheckAlignment((uint32_t)ls, ea, size));

	// use the largest of 8/4/2 that divides 'size' evenly, else single bytes
	uint32_t step;
	if ((size & 0x7) == 0)
	{
		step = 8;
	}
	else if ((size & 0x3) == 0)
	{
		step = 4;
	}
	else if ((size & 0x1) == 0)
	{
		step = 2;
	}
	else
	{
		step = 1;
	}

	uint32_t lsCursor = (uint32_t)ls;
	for (uint32_t remaining = size; remaining != 0; remaining -= step)
	{
		// local store -> main memory, one small element at a time
		spu_mfcdma64( (void*)lsCursor, 0, ea, step, tagId, MFC_PUT_CMD);

		lsCursor += step;
		ea += step;
	}
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaGetList
//
// Gather data scattered around main mem, MFC will run through the list, and place the elements (based on ea address and size)
// contiguously in ls.
//
// NOTE: if an individual list element size is <16B, the data will still be dma'd but the following element will be placed 
// on the next 16B boundary. So it is possible to get lots of small elements, but you will be left with gaps in ls.
//
// ls - ls address of where items will be placed (contiguously)
// lsList - ls address of actual list
// sizeList - size of list in bytes (each list element is 8B (sizeof(DMAList)), so sizeList should be number of list elements * sizeof(DMAList))
// tagId - works the same way as regular DMA's
//
// Alignment and Size Restrictions:
// -ls and lsList must be 8B aligned
// -size must be a multiple of 8B (sizeof(DMAList))
// -no more than 2048 list elements
//
// light error checking right now
//--------------------------------------------------------------------------------------------------

// Queues a list get on DMA tag 'tagId': the list at 'pLS_List' (in local
// store, 'sizeList' bytes long) describes the main-memory pieces to fetch, and
// the data is placed in local store starting at 'ls'. Only the arguments are
// sanity checked; completion is not waited for here.
void SpuMgr::DmaGetList(void *ls, DMAList *pLS_List, uint32_t sizeList, uint32_t tagId)
{
	DEBUG_ERROR( ((uint32_t)pLS_List&0x7) == 0 );	// list address must be 8B aligned
	DEBUG_ERROR( ((uint32_t)ls&0x7) == 0 );			// ls data address must be 8B aligned as well
	DEBUG_ERROR( (sizeList&0x7) == 0 );				// list size is a multiple of 8B
	// no more than 2048 list elements - exactly 2048 is legal, so the bound is
	// inclusive (it was '<', which rejected a full-size list)
	DEBUG_ERROR( sizeList<=(2048*sizeof(DMAList)));

	// initiate dma list; for list commands the list's ls address goes in the ea slot
	spu_mfcdma64( ls, 0, (uint32_t)pLS_List, sizeList, tagId, MFC_GETL_CMD );
}

//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaPutList
//   
// Scatter data held contiguously in ls, to main mem
// 
//   ls - ls address of where items exist (contiguously) to be scattered back to main mem
// lsList - ls address of actual list
// sizeList - size of list in bytes (each list element is 8B (sizeof(DMAList)), so sizeList should be number of list elements * sizeof(DMAList))
//   tagId - works the same way as regular DMA's
// 
// Alignment and Size Restrictions:
// ls and lsList must be 8B aligned, size must be a multiple of 8B (sizeof(DMAList))
// 
//   light error checking right now
//--------------------------------------------------------------------------------------------------

// Queues a list put on DMA tag 'tagId': the data held contiguously in local
// store at 'ls' is scattered to main memory as described by the list at
// 'pLS_List' (in local store, 'sizeList' bytes long). Only the arguments are
// sanity checked; completion is not waited for here.
void SpuMgr::DmaPutList(void *ls, DMAList* pLS_List, uint32_t sizeList, uint32_t tagId)
{
	DEBUG_ERROR( ((uint32_t)pLS_List&0x7) == 0 );	// list address must be 8B aligned
	DEBUG_ERROR( ((uint32_t)ls&0x7) == 0 );			// ls data address must be 8B aligned as well
	DEBUG_ERROR( (sizeList&0x7) == 0 );				// list size is a multiple of 8B
	// no more than 2048 list elements - exactly 2048 is legal, so the bound is
	// inclusive (it was '<', which rejected a full-size list)
	DEBUG_ERROR( sizeList<=(2048*sizeof(DMAList)));

	// initiate dma list; for list commands the list's ls address goes in the ea slot
	spu_mfcdma64( ls, 0, (uint32_t)pLS_List, sizeList, tagId, MFC_PUTL_CMD );
}