//====== Copyright © 1996-2007, Valve Corporation, All rights reserved. =======//
//
// Purpose:
//
// $NoKeywords: $
//
// A fixed-allocation class for maintaining a 1D, 2D, or 3D array of data in a structure-of-arrays
// (SOA), SSE-friendly manner.
// =============================================================================//
#ifndef UTLSOACONTAINER_H
#define UTLSOACONTAINER_H
#ifdef _WIN32
#pragma once
#endif
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "tier0/threadtools.h"
#include "tier1/utlmemory.h"
#include "tier1/utlblockmemory.h"
#include "mathlib/ssemath.h"
// Strided pointers: classes that act like pointers, but whose ++ and += operators advance by a
// configurable stride rather than by sizeof( T ).
template<class T> class CStridedPtr
{
protected:
	T *m_pData;
	size_t m_nStride;

public:
	FORCEINLINE CStridedPtr<T>( void *pData, size_t nByteStride )
	{
		m_pData = reinterpret_cast<T *>( pData );
		m_nStride = nByteStride / sizeof( T );
	}

	FORCEINLINE CStridedPtr<T>( void ) {}

	T *operator->(void) const { return m_pData; }
	T & operator*(void) const { return *m_pData; }

	FORCEINLINE operator T *(void) { return m_pData; }

	FORCEINLINE CStridedPtr<T> & operator++(void)
	{
		m_pData += m_nStride;
		return *this;
	}

	FORCEINLINE void operator+=( size_t nNumElements )
	{
		m_pData += nNumElements * m_nStride;
	}

	FORCEINLINE size_t Stride( void ) const { return m_nStride; }
};
template<class T> class CStridedConstPtr
{
protected:
	const T *m_pData;
	size_t m_nStride;

public:
	FORCEINLINE CStridedConstPtr<T>( void const *pData, size_t nByteStride )
	{
		m_pData = reinterpret_cast<T const *>( pData );
		m_nStride = nByteStride / sizeof( T );
	}

	FORCEINLINE CStridedConstPtr<T>( void ) {}

	const T *operator->(void) const { return m_pData; }
	const T & operator*(void) const { return *m_pData; }

	FORCEINLINE operator const T *(void) const { return m_pData; }

	FORCEINLINE CStridedConstPtr<T> &operator++(void)
	{
		m_pData += m_nStride;
		return *this;
	}

	FORCEINLINE void operator+=( size_t nNumElements )
	{
		m_pData += nNumElements * m_nStride;
	}

	FORCEINLINE size_t Stride( void ) const { return m_nStride; }
};
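// Usage sketch: walking one field of an array-of-structs with a strided pointer, so that ++
// advances from struct to struct. Particle here is just an illustrative type:
//
//   struct Particle { float x, y, z; };
//   Particle particles[128];
//   CStridedConstPtr<float> pX( &particles[0].x, sizeof( Particle ) );
//   for ( int i = 0; i < 128; i++, ++pX )
//   {
//       float flX = *pX;								// x of particle i
//   }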
// Allowed field data types. If you change these values, you need to change the tables in the .cpp file.
enum EAttributeDataType
{
	ATTRDATATYPE_NONE = -1,								// pad and varargs ender
	ATTRDATATYPE_FLOAT = 0,								// a float attribute
	ATTRDATATYPE_4V,									// vector data type, stored as class FourVectors
	ATTRDATATYPE_INT,									// integer. not especially SSE-able on all architectures.
	ATTRDATATYPE_POINTER,								// a pointer.

	ATTRDATATYPE_COUNT,
};
#define MAX_SOA_FIELDS 32
class KMeansQuantizedValue;
class IKMeansErrorMetric;

typedef fltx4 (*UNARYSIMDFUNCTION)( fltx4 const & );
typedef fltx4 (*BINARYSIMDFUNCTION)( fltx4 const &, fltx4 const & );
class CSOAAttributeReference;
/// Mode of threading for a container. Normally set automatically based upon dimensions, but
/// controllable via SetThreadMode.
enum SOAThreadMode_t
{
	SOATHREADMODE_NONE = 0,
	SOATHREADMODE_BYROWS = 1,
	SOATHREADMODE_BYSLICES = 2,
	SOATHREADMODE_BYROWS_AND_SLICES = 3,

	SOATHREADMODE_AUTO = -1,							// compute based upon dimensions
};
class CSOAContainer
{
	friend class CSOAAttributeReference;

public:
	// Constructor, destructor
	CSOAContainer( void );								// an empty one with no attributes
	CSOAContainer( int nCols, int nRows, int nSlices, ... );
	~CSOAContainer( void );
// !!!!! UPDATE SERIALIZATION CODE WHENEVER THE STRUCTURE OF CSOAContainer CHANGES !!!!!
// To avoid dependency on datamodel, serialization is implemented in utlsoacontainer_serialization.cpp, in dmxloader.lib
//bool Serialize( CDmxElement *pRootElement );
//bool Unserialize( const CDmxElement *pRootElement );
// Set the data type for an attribute. If you set the data type, but tell it not to allocate,
// the data type will be set but writes will assert, and reads will give you back zeros. if
// AllocateData hasn't been called yet, this will set up for AllocateData to reserve space for
// this attribute. If you have already called AllocateData, but wish to add an attribute, you
// can also use this, which will result in separate memory being allocated for this attribute.
	void SetAttributeType( int nAttrIdx, EAttributeDataType nDataType, bool bAllocateMemory = true );

	EAttributeDataType GetAttributeType( int nAttrIdx ) const;
// Set the attribute type for a field, if that field is not already present (potentially
// allocating memory). You can use this, for instance, to make sure an already loaded image has
// an alpha channel.
void EnsureDataType( int nAttrIdx, EAttributeDataType nDataType );
// set back to un-initted state, freeing memory
void Purge( void );
// Allocate, purge data
void AllocateData( int nNCols, int nNRows, int nSlices = 1 ); // actually allocate the memory and set the pointers up
void PurgeData( void );
// Did the container allocate memory for this attribute?
bool HasAllocatedMemory( int nAttrIdx ) const;
// easy constructor for 2d using varargs. call like
// #define ATTR_RED 0
// #define ATTR_GREEN 1
// #define ATTR_BLUE 2
// CSOAContainer myimage( 256, 256, ATTR_RED, ATTRDATATYPE_FLOAT, ATTR_GREEN, ATTRDATATYPE_FLOAT,
// ATTR_BLUE, ATTRDATATYPE_FLOAT, -1 );
	int NumCols( void ) const;
	int NumRows( void ) const;
	int NumSlices( void ) const;

	void AssertDataType( int nAttrIdx, EAttributeDataType nDataType ) const;
// # of groups of 4 elements per row
	int NumQuadsPerRow( void ) const;

	int Count( void ) const;							// for 1d data
	int NumElements( void ) const;
// how much to step to go from the end of one row to the start of the next one. Basically, how
// many bytes to add at the end of a row when iterating over the whole 2d array with ++
	size_t RowToRowStep( int nAttrIdx ) const;

	template<class T> T *RowPtr( int nAttributeIdx, int nRowNumber, int nSliceNumber = 0 ) const;
	void const *ConstRowPtr( int nAttributeIdx, int nRowNumber, int nSliceNumber = 0 ) const;
	template<class T> T *ElementPointer( int nAttributeIdx, int nX = 0, int nY = 0, int nZ = 0 ) const;
	FourVectors *ElementPointer4V( int nAttributeIdx, int nX = 0, int nY = 0, int nZ = 0 ) const;
	size_t ItemByteStride( int nAttributeIdx ) const;
FORCEINLINE float &FloatValue( int nAttrIdx, int nX, int nY, int nZ ) const { AssertDataType( nAttrIdx, ATTRDATATYPE_FLOAT ); return RowPtr<float>( nAttrIdx, nY, nZ )[nX]; }
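	// Usage sketch: visiting every SIMD group of a (hypothetical) float attribute ATTR_MY_FIELD of a
	// container "img" with RowPtr() and RowToRowStep(), the same pattern the Apply*ToAttr templates
	// below use:
	//
	//   fltx4 *pData = img.RowPtr<fltx4>( ATTR_MY_FIELD, 0 );
	//   size_t nRowStep = img.RowToRowStep( ATTR_MY_FIELD ) / sizeof( fltx4 );
	//   for ( int nRow = 0; nRow < img.NumRows() * img.NumSlices(); nRow++ )
	//   {
	//       for ( int nQuad = 0; nQuad < img.NumQuadsPerRow(); nQuad++ )
	//       {
	//           *pData = MulSIMD( *pData, ReplicateX4( 2.0f ) );	// e.g. double every value
	//           pData++;
	//       }
	//       pData += nRowStep;
	//   }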
// return a reference to an attribute, which can have operations performed on it. For instance,
// this is valid code to zero out the red component of a whole image:
// myImage[FBM_ATTR_RED] = 0.;
CSOAAttributeReference operator[]( int nAttrIdx );
	// This is just an alias for readability with pointers: instead of (*p)[FBM_ATTR_RED], you can do p->Attr( FBM_ATTR_RED );
FORCEINLINE CSOAAttributeReference Attr( int nAttrIdx );
// copy the attribute data from another soacontainer. must be compatible geometry.
void CopyAttrFrom( CSOAContainer const &other, int nDestAttributeIdx, int nSrcAttributeIndex = -1 );
// copy the attribute data from another attribute. must be compatible data format
void CopyAttrToAttr( int nSrcAttributeIndex, int nDestAttributeIndex);
// copy a subvolume of attribute data from one container to another.
void CopyRegionFrom( CSOAContainer const &src, int nSrcAttr, int nDestAttr, int nSrcMinX, int nSrcMaxX, int nSrcMinY, int nSrcMaxY, int nSrcMinZ, int nSrcMaxZ, int nDestX, int nDestY, int nDestZ );
// copy all fields from a region of src to this.
void CopyRegionFrom( CSOAContainer const &src, int nSrcMinX, int nSrcMaxX, int nSrcMinY, int nSrcMaxY, int nSrcMinZ, int nSrcMaxZ, int nDestX, int nDestY, int nDestZ );
// move all the data from one csoacontainer to another, leaving the source empty. this is just
// a pointer copy.
FORCEINLINE void MoveDataFrom( CSOAContainer other );
// arithmetic and data filling functions. All SIMD and hopefully fast
/// set all elements of a float attribute to random #s
void RandomizeAttribute( int nAttr, float flMin, float flMax ) const;
/// this.attr = vec
void FillAttr( int nAttr, Vector const &vecValue );
/// this.attr = float
void FillAttr( int nAttr, float flValue );
/// this.nDestAttr *= src.nSrcAttr
void MulAttr( CSOAContainer const &src, int nSrcAttr, int nDestAttr );
/// Returns the result of repeatedly combining attr values with the initial value using the specified function.
/// For instance, SumAttributeValue is just ReduceAttr<AddSIMD>( attr, FOUR_ZEROS );
template<BINARYSIMDFUNCTION fn> float ReduceAttr( int nSrcAttr, fltx4 const &fl4InitialValue ) const;
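	/// Usage sketch: the same template can express other reductions. For example, a maximum over a
	/// (hypothetical) float attribute ATTR_MY_FIELD, similar in spirit to MaxAttributeValue():
	///
	///   float flMax = img.ReduceAttr<MaxSIMD>( ATTR_MY_FIELD, ReplicateX4( -FLT_MAX ) );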
template<BINARYSIMDFUNCTION fn> void ApplyBinaryFunctionToAttr( int nDestAttr, fltx4 const &flFnArg1 );
/// this.attr = fn1( fn2( attr, arg2 ), arg1 )
template<BINARYSIMDFUNCTION fn1, BINARYSIMDFUNCTION fn2> void ApplyTwoComposedBinaryFunctionsToAttr( int nDestAttr, fltx4 const &flFnArg1, fltx4 const &flFnArg2 );
/// this.nDestAttr *= flValue
void MulAttr( int nDestAttr, float flScale ) { ApplyBinaryFunctionToAttr<MulSIMD>( nDestAttr, ReplicateX4( flScale ) ); }
void AddToAttr( int nDestAttr, float flAddend ) { ApplyBinaryFunctionToAttr<AddSIMD>( nDestAttr, ReplicateX4( flAddend ) ); }
// this.attr = max( this.attr, flminvalue )
void MaxAttr( int nDestAttr, float flMinValue ) { ApplyBinaryFunctionToAttr<MaxSIMD>( nDestAttr, ReplicateX4( flMinValue ) ); }
/// this.attr = min( this.attr, flminvalue )
void MinAttr( int nDestAttr, float flMaxValue ) { ApplyBinaryFunctionToAttr<MinSIMD>( nDestAttr, ReplicateX4( flMaxValue ) ); }
void ClampAttr( int nDestAttr, float flMinValue, float flMaxValue ) { ApplyTwoComposedBinaryFunctionsToAttr<MinSIMD, MaxSIMD>( nDestAttr, ReplicateX4( flMaxValue ), ReplicateX4( flMinValue ) ); }
/// this.attr = normalize( this.attr )
void NormalizeAttr( int nAttr );
	/// Fill a 2d rectangle with values interpolated from 4 corner values.
	void FillAttrWithInterpolatedValues( int nAttr, float flValue00, float flValue10, float flValue01, float flValue11 ) const;
	void FillAttrWithInterpolatedValues( int nAttr, Vector flValue00, Vector flValue10, Vector const &flValue01, Vector const &flValue11 ) const;
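	/// Usage sketch (assuming the four arguments map to the (0,0), (1,0), (0,1), (1,1) corners):
	/// a left-to-right ramp from 0 to 1 across a hypothetical attribute ATTR_MY_FIELD:
	///
	///   img.FillAttrWithInterpolatedValues( ATTR_MY_FIELD, 0.0f, 1.0f, 0.0f, 1.0f );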
	/// Grab 3 scalar attributes from another CSOAContainer and pack them into a FourVectors attribute of this one.
void PackScalarAttributesToVectorAttribute( CSOAContainer *pInput, int nVecAttributeOut, int nScalarAttributeX, int nScalarAttributeY, int nScalarAttributeZ );
	/// Grab the 3 components of a vector attribute and store them in 3 scalar attributes.
	void UnPackVectorAttributeToScalarAttributes( CSOAContainer *pInput, int nVecAttributeIn, int nScalarAttributeX, int nScalarAttributeY, int nScalarAttributeZ );

	/// this.attrout = src.attrin * vec (component by component)
void MultiplyVectorAttribute( CSOAContainer *pInput, int nAttributeIn, Vector const &vecScalar, int nAttributeOut );
/// Given an soa container of a different dimension, resize one attribute from it to fit this
/// table's geometry. point sampling only
void ResampleAttribute( CSOAContainer &pInput, int nAttr );
/// sum of all floats in an attribute
float SumAttributeValue( int nAttr ) const;
/// sum(attr) / ( w * h * d )
float AverageFloatAttributeValue( int nAttr ) const;
/// maximum float value in a float attr
float MaxAttributeValue( int nAttr ) const;
/// minimum float value in a float attr
float MinAttributeValue( int nAttr ) const;
/// scalartargetattribute += w*exp( vecdir dot ndirection)
void AddGaussianSRBF( float flWeight, Vector vecDir, int nDirectionAttribute, int nScalarTargetAttribute );
/// vec3targetattribute += w*exp( vecdir dot ndirection)
void AddGaussianSRBF( Vector vecWeight, Vector vecDir, int nDirectionAttribute, int nVectorTargetAttribute );
/// find the largest value of a vector attribute
void FindLargestMagnitudeVector( int nAttr, int *nx, int *ny, int *nz );
void KMeansQuantization( int const *pFieldIndices, int nNumFields, KMeansQuantizedValue *pOutValues, int nNumResultsDesired, IKMeansErrorMetric *pErrorCalculator, int nFieldToStoreIndexInto, int nNumIterations, int nChannelToReceiveErrorSignal = -1 );
// Calculate the signed distance, in voxels, between all voxels and a surface boundary defined
// by nSrcField being >0. Voxels with nSrcField <0 will end up with negative distances. Voxels
// with nSrcField == 0 will get 0, and nSrcField >0 will yield positive distances. Note the
// min/max x/y/z fields don't reflect the range to be written, but rather represent the bounds
// of updated voxels that you want your distance field modified to take into account. This
// volume will be bloated based upon the nMaxDistance parameter and simd padding. A
// brute-force algorithm is used, but it is threaded and simd'd. Large "nMaxDistance" values
// applied to large images can take a long time, as the execution time per output pixel is
// proportional to maxdistance^2. The rect argument, if passed, will be modified to be the
// entire rectangle modified by the operation.
void GenerateDistanceField( int nSrcField, int nDestField, int nMaxDistance, Rect3D_t *pRect = NULL );
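	// Usage sketch: build a signed distance field out to 8 voxels from a hypothetical occupancy
	// attribute into a distance attribute, and find out which region was touched:
	//
	//   Rect3D_t modifiedRect;
	//   voxels.GenerateDistanceField( ATTR_OCCUPANCY, ATTR_DISTANCE, 8, &modifiedRect );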
void SetThreadMode( SOAThreadMode_t eThreadMode );
protected:
	int m_nColumns;										// # of rows and columns created with
	int m_nRows;
	int m_nSlices;

	int m_nPaddedColumns;								// # of columns rounded up for SSE
	int m_nNumQuadsPerRow;								// # of groups of 4 elements per row

	uint8 *m_pDataMemory;								// the actual data memory
	uint8 *m_pAttributePtrs[MAX_SOA_FIELDS];

	EAttributeDataType m_nDataType[MAX_SOA_FIELDS];

	size_t m_nStrideInBytes[MAX_SOA_FIELDS];			// stride from one field datum to another
	size_t m_nRowStrideInBytes[MAX_SOA_FIELDS];			// stride from one row datum to another per field
	size_t m_nSliceStrideInBytes[MAX_SOA_FIELDS];		// stride from one slice datum to another per field

	uint32 m_nFieldPresentMask;

	uint8 *m_pConstantDataMemory;
	uint8 *m_pSeparateDataMemory[MAX_SOA_FIELDS];		// for fields allocated separately from the main allocation

	SOAThreadMode_t m_eThreadMode;						// current threading mode
	FORCEINLINE void Init( void )
	{
		memset( m_nDataType, 0xff, sizeof( m_nDataType ) );
		memset( m_pSeparateDataMemory, 0, sizeof( m_pSeparateDataMemory ) );
#ifdef _DEBUG
		memset( m_pAttributePtrs, 0xFF, sizeof( m_pAttributePtrs ) );
		memset( m_nStrideInBytes, 0xFF, sizeof( m_nStrideInBytes ) );
		memset( m_nRowStrideInBytes, 0xFF, sizeof( m_nRowStrideInBytes ) );
		memset( m_nSliceStrideInBytes, 0xFF, sizeof( m_nSliceStrideInBytes ) );
#endif
		m_pConstantDataMemory = NULL;
		m_pDataMemory = 0;
		m_nNumQuadsPerRow = 0;
		m_nColumns = m_nPaddedColumns = m_nRows = m_nSlices = 0;
		m_nFieldPresentMask = 0;
		m_eThreadMode = SOATHREADMODE_NONE;
	}
void UpdateDistanceRow( int nSearchRadius, int nMinX, int nMaxX, int nY, int nZ, int nSrcField, int nDestField );
// parallel helper functions. These do the work, and all take a row/column range as their first arguments.
	void CopyAttrFromPartial( int nStartRow, int nNumRows, int nStartSlice, int nEndSlice, CSOAContainer const *pOther, int nDestAttributeIndex, int nSrcAttributeIndex );
	void FillAttrPartial( int nStartRow, int nNumRows, int nStartSlice, int nEndSlice, int nAttr, fltx4 fl4Value );
// Allocation utility funcs (NOTE: all allocs are multiples of 16, and are aligned allocs)
size_t DataMemorySize( void ) const; // total bytes of data memory to allocate at m_pDataMemory (if all attributes were allocated in a single block)
size_t ConstantMemorySize( void ) const; // total bytes of constant memory to allocate at m_pConstantDataMemory (if all constant attributes were allocated in a single block)
size_t AttributeMemorySize( int nAttrIndex ) const; // total bytes of data memory allocated to a single attribute (constant or otherwise)
	void AllocateDataMemory( void );
	void AllocateConstantMemory( void );
};
// define binary op class to allow this construct without temps:
// dest( FBM_ATTR_RED ) = src( FBM_ATTR_BLUE ) + src( FBM_ATTR_GREEN )
template<BINARYSIMDFUNCTION fn> class CSOAAttributeReferenceBinaryOp;
class CSOAAttributeReference
{
	friend class CSOAContainer;

	class CSOAContainer *m_pContainer;
	int m_nAttributeID;

public:
	FORCEINLINE void operator *=( float flScale ) const { m_pContainer->MulAttr( m_nAttributeID, flScale ); }
	FORCEINLINE void operator +=( float flAddend ) const { m_pContainer->AddToAttr( m_nAttributeID, flAddend ); }
	FORCEINLINE void operator -=( float flAddend ) const { m_pContainer->AddToAttr( m_nAttributeID, -flAddend ); }
	FORCEINLINE void operator =( float flValue ) const { m_pContainer->FillAttr( m_nAttributeID, flValue ); }

	FORCEINLINE void operator =( CSOAAttributeReference const &other ) const
	{
		m_pContainer->CopyAttrFrom( *other.m_pContainer, m_nAttributeID, other.m_nAttributeID );
	}

	template<BINARYSIMDFUNCTION fn> FORCEINLINE void operator =( CSOAAttributeReferenceBinaryOp<fn> const &op );

	FORCEINLINE void CopyTo( CSOAAttributeReference &other ) const;	// since operator= is overridden
};
// define binary op class to allow this construct without temps:
// dest( FBM_ATTR_RED ) = src( FBM_ATTR_BLUE ) + src( FBM_ATTR_GREEN )
template<BINARYSIMDFUNCTION fn> class CSOAAttributeReferenceBinaryOp
{
public:
	CSOAAttributeReference m_opA;
	CSOAAttributeReference m_opB;

	CSOAAttributeReferenceBinaryOp( CSOAAttributeReference const &a, CSOAAttributeReference const &b )
	{
		a.CopyTo( m_opA );
		b.CopyTo( m_opB );
	}
};
#define DEFINE_OP( opname, fnname ) \
FORCEINLINE CSOAAttributeReferenceBinaryOp<fnname> operator opname( CSOAAttributeReference const &left, CSOAAttributeReference const &right ) \
{ \
	return CSOAAttributeReferenceBinaryOp<fnname>( left, right ); \
}
// these operator overloads let you do
// dst[ATT1] = src1[ATT] + src2[ATT] with no temporaries generated
DEFINE_OP( +, AddSIMD );
DEFINE_OP( *, MulSIMD );
DEFINE_OP( -, SubSIMD );
DEFINE_OP( /, DivSIMD );
template<BINARYSIMDFUNCTION fn> FORCEINLINE void CSOAAttributeReference::operator =( CSOAAttributeReferenceBinaryOp<fn> const &op )
{
	m_pContainer->AssertDataType( m_nAttributeID, ATTRDATATYPE_FLOAT );
	fltx4 *pOut = m_pContainer->RowPtr<fltx4>( m_nAttributeID, 0 );
	// GCC on PS3 gets confused by this code, so we literally have to break it into multiple statements
	CSOAContainer *pContainerA = op.m_opA.m_pContainer;
	CSOAContainer *pContainerB = op.m_opB.m_pContainer;
	fltx4 *pInA = pContainerA->RowPtr<fltx4>( op.m_opA.m_nAttributeID, 0 );
	fltx4 *pInB = pContainerB->RowPtr<fltx4>( op.m_opB.m_nAttributeID, 0 );
	size_t nRowToRowStride = m_pContainer->RowToRowStep( m_nAttributeID ) / sizeof( fltx4 );
	int nRowCtr = m_pContainer->NumRows() * m_pContainer->NumSlices();
	do
	{
		int nColCtr = m_pContainer->NumQuadsPerRow();
		do
		{
			*(pOut++) = fn( *( pInA++ ), *( pInB++ ) );
		} while ( --nColCtr );
		pOut += nRowToRowStride;
		pInA += nRowToRowStride;
		pInB += nRowToRowStride;
	} while ( --nRowCtr );
}
FORCEINLINE void CSOAAttributeReference::CopyTo( CSOAAttributeReference &other ) const
{
	other.m_pContainer = m_pContainer;
	other.m_nAttributeID = m_nAttributeID;
}

FORCEINLINE CSOAAttributeReference CSOAContainer::operator[]( int nAttrIdx )
{
	CSOAAttributeReference ret;
	ret.m_pContainer = this;
	ret.m_nAttributeID = nAttrIdx;
	return ret;
}

FORCEINLINE CSOAAttributeReference CSOAContainer::Attr( int nAttrIdx )
{
	return (*this)[nAttrIdx];
}
template<BINARYSIMDFUNCTION fn1, BINARYSIMDFUNCTION fn2> void CSOAContainer::ApplyTwoComposedBinaryFunctionsToAttr( int nDestAttr, fltx4 const &fl4FnArg1, fltx4 const &fl4FnArg2 )
{
	if ( m_nDataType[nDestAttr] == ATTRDATATYPE_4V )
	{
		FourVectors *pOut = RowPtr<FourVectors>( nDestAttr, 0 );
		size_t nRowToRowStride = RowToRowStep( nDestAttr ) / sizeof( FourVectors );
		int nRowCtr = NumRows() * NumSlices();
		do
		{
			int nColCtr = NumQuadsPerRow();
			do
			{
				pOut->x = fn1( fn2( pOut->x, fl4FnArg2 ), fl4FnArg1 );
				pOut->y = fn1( fn2( pOut->y, fl4FnArg2 ), fl4FnArg1 );
				pOut->z = fn1( fn2( pOut->z, fl4FnArg2 ), fl4FnArg1 );
				pOut++;									// advance to the next group of 4 vectors
			} while ( --nColCtr );
			pOut += nRowToRowStride;
		} while ( --nRowCtr );
	}
	else
	{
		AssertDataType( nDestAttr, ATTRDATATYPE_FLOAT );
		fltx4 *pOut = RowPtr<fltx4>( nDestAttr, 0 );
		size_t nRowToRowStride = RowToRowStep( nDestAttr ) / sizeof( fltx4 );
		int nRowCtr = NumRows() * NumSlices();
		do
		{
			int nColCtr = NumQuadsPerRow();
			do
			{
				*pOut = fn1( fn2( *pOut, fl4FnArg2 ), fl4FnArg1 );
				pOut++;
			} while ( --nColCtr );
			pOut += nRowToRowStride;
		} while ( --nRowCtr );
	}
}
template<BINARYSIMDFUNCTION fn> void CSOAContainer::ApplyBinaryFunctionToAttr( int nDestAttr, fltx4 const &fl4FnArg1 )
{
	if ( m_nDataType[nDestAttr] == ATTRDATATYPE_4V )
	{
		FourVectors *pOut = RowPtr<FourVectors>( nDestAttr, 0 );
		size_t nRowToRowStride = RowToRowStep( nDestAttr ) / sizeof( FourVectors );
		int nRowCtr = NumRows() * NumSlices();
		do
		{
			int nColCtr = NumQuadsPerRow();
			do
			{
				pOut->x = fn( pOut->x, fl4FnArg1 );
				pOut->y = fn( pOut->y, fl4FnArg1 );
				pOut->z = fn( pOut->z, fl4FnArg1 );
				pOut++;									// advance to the next group of 4 vectors
			} while ( --nColCtr );
			pOut += nRowToRowStride;
		} while ( --nRowCtr );
	}
	else
	{
		AssertDataType( nDestAttr, ATTRDATATYPE_FLOAT );
		fltx4 *pOut = RowPtr<fltx4>( nDestAttr, 0 );
		size_t nRowToRowStride = RowToRowStep( nDestAttr ) / sizeof( fltx4 );
		int nRowCtr = NumRows() * NumSlices();
		do
		{
			int nColCtr = NumQuadsPerRow();
			do
			{
				*pOut = fn( *pOut, fl4FnArg1 );
				pOut++;
			} while ( --nColCtr );
			pOut += nRowToRowStride;
		} while ( --nRowCtr );
	}
}
template<BINARYSIMDFUNCTION fn> float CSOAContainer::ReduceAttr( int nSrcAttr, fltx4 const &fl4InitialValue ) const
{
	AssertDataType( nSrcAttr, ATTRDATATYPE_FLOAT );
	fltx4 fl4Result = fl4InitialValue;
	fltx4 const *pIn = RowPtr<fltx4>( nSrcAttr, 0 );
	size_t nRowToRowStride = RowToRowStep( nSrcAttr ) / sizeof( fltx4 );
	int nRowCtr = NumRows() * NumSlices();
	bi32x4 fl4LastColumnMask = (bi32x4)LoadAlignedSIMD( g_SIMD_SkipTailMask[NumCols() & 3] );
	do
	{
		for( int i = 0; i < NumQuadsPerRow() - 1; i++ )
		{
			fl4Result = fn( fl4Result, *( pIn++ ) );
		}
		// handle the last column in case it's not a multiple of 4 wide
		fl4Result = MaskedAssign( fl4LastColumnMask, fn( fl4Result, *( pIn++ ) ), fl4Result );
		pIn += nRowToRowStride;
	} while ( --nRowCtr );
	// now, combine the subfields
	fl4Result = fn( fn( fl4Result, SplatYSIMD( fl4Result ) ), fn( SplatZSIMD( fl4Result ), SplatWSIMD( fl4Result ) ) );
	return SubFloat( fl4Result, 0 );
}
#define QUANTIZER_NJOBS 1 // # of simultaneous subjobs to execute for kmeans quantizer
// kmeans quantization classes
// the array of quantized values returned by quantization
class KMeansQuantizedValue
{
public:
	FourVectors m_vecValuePosition;						// replicated
	fltx4 m_fl4Values[MAX_SOA_FIELDS];					// replicated

	float m_flValueAccumulators[QUANTIZER_NJOBS][MAX_SOA_FIELDS];
	float m_flWeightAccumulators[QUANTIZER_NJOBS];

	FORCEINLINE float operator()( int n ) { return SubFloat( m_fl4Values[n], 0 ); }
};
class KMeansSampleDescriptor
{
public:
	fltx4 *m_pInputValues[MAX_SOA_FIELDS];

	FORCEINLINE fltx4 const & operator()( int nField ) const { return *m_pInputValues[nField]; }
};
class IKMeansErrorMetric
{
public:
	virtual void CalculateError( KMeansSampleDescriptor const &sampleAddresses, FourVectors const &v4SamplePositions, KMeansQuantizedValue const &valueToCompareAgainst, fltx4 *pfl4ErrOut ) = 0;

	// for things like normalization, etc.
	virtual void PostAdjustQuantizedValue( KMeansQuantizedValue &valueToAdjust ) {}

	// for global fixup after each adjustment step
	virtual void PostStep( int const *pFieldIndices, int nNumFields, KMeansQuantizedValue *pValues, int nNumQuantizedValues, int nIndexField, CSOAContainer &data ) {}
};
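// Implementation sketch (illustrative only): a minimal error metric that scores candidate quantized
// values by squared difference on a single field (field index 0 here is just an example):
//
//   class CSquaredErrorMetric : public IKMeansErrorMetric
//   {
//   public:
//       virtual void CalculateError( KMeansSampleDescriptor const &sampleAddresses,
//                                    FourVectors const &v4SamplePositions,
//                                    KMeansQuantizedValue const &valueToCompareAgainst,
//                                    fltx4 *pfl4ErrOut )
//       {
//           fltx4 fl4Delta = SubSIMD( sampleAddresses( 0 ), valueToCompareAgainst.m_fl4Values[0] );
//           *pfl4ErrOut = MulSIMD( fl4Delta, fl4Delta );		// squared error for 4 samples at once
//       }
//   };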
FORCEINLINE CSOAContainer::CSOAContainer( void ) { Init(); }
//-----------------------------------------------------------------------------
// Did the container allocate memory for this attribute?
//-----------------------------------------------------------------------------
FORCEINLINE bool CSOAContainer::HasAllocatedMemory( int nAttrIdx ) const { return ( m_nFieldPresentMask & ( 1 << nAttrIdx ) ) != 0; }
FORCEINLINE EAttributeDataType CSOAContainer::GetAttributeType( int nAttrIdx ) const { Assert( ( nAttrIdx >= 0 ) && ( nAttrIdx < MAX_SOA_FIELDS ) ); return m_nDataType[nAttrIdx]; }
FORCEINLINE void CSOAContainer::EnsureDataType( int nAttrIdx, EAttributeDataType nDataType ) { if ( !HasAllocatedMemory( nAttrIdx ) ) { SetAttributeType( nAttrIdx, nDataType ); } }
FORCEINLINE int CSOAContainer::NumRows( void ) const { return m_nRows; }
FORCEINLINE int CSOAContainer::NumCols( void ) const { return m_nColumns; }

FORCEINLINE int CSOAContainer::NumSlices( void ) const { return m_nSlices; }
FORCEINLINE void CSOAContainer::AssertDataType( int nAttrIdx, EAttributeDataType nDataType ) const { Assert( nAttrIdx >= 0 ); Assert( nAttrIdx < MAX_SOA_FIELDS ); Assert( m_nDataType[ nAttrIdx ] == nDataType ); }
// # of groups of 4 elements per row
FORCEINLINE int CSOAContainer::NumQuadsPerRow( void ) const { return m_nNumQuadsPerRow; }
FORCEINLINE int CSOAContainer::Count( void ) const // for 1d data
{ return NumCols(); }
FORCEINLINE int CSOAContainer::NumElements( void ) const { return NumCols() * NumRows() * NumSlices(); }
// how much to step to go from the end of one row to the start of the next one. Basically, how
// many bytes to add at the end of a row when iterating over the whole 2d array with ++
FORCEINLINE size_t CSOAContainer::RowToRowStep( int nAttrIdx ) const { return 0; }
template<class T> FORCEINLINE T *CSOAContainer::RowPtr( int nAttributeIdx, int nRowNumber, int nSliceNumber ) const
{
	Assert( nRowNumber < m_nRows );
	Assert( nAttributeIdx < MAX_SOA_FIELDS );
	Assert( m_nDataType[nAttributeIdx] != ATTRDATATYPE_NONE );
	Assert( ( m_nFieldPresentMask & ( 1 << nAttributeIdx ) ) || ( ( nRowNumber == 0 ) && ( nSliceNumber == 0 ) ) );
	return reinterpret_cast<T *>( m_pAttributePtrs[nAttributeIdx]
								  + nRowNumber * m_nRowStrideInBytes[nAttributeIdx]
								  + nSliceNumber * m_nSliceStrideInBytes[nAttributeIdx] );
}
FORCEINLINE void const *CSOAContainer::ConstRowPtr( int nAttributeIdx, int nRowNumber, int nSliceNumber ) const
{
	Assert( nRowNumber < m_nRows );
	Assert( nAttributeIdx < MAX_SOA_FIELDS );
	Assert( m_nDataType[nAttributeIdx] != ATTRDATATYPE_NONE );
	return m_pAttributePtrs[nAttributeIdx]
		+ nRowNumber * m_nRowStrideInBytes[nAttributeIdx]
		+ nSliceNumber * m_nSliceStrideInBytes[nAttributeIdx];
}
template<class T> FORCEINLINE T *CSOAContainer::ElementPointer( int nAttributeIdx, int nX, int nY, int nZ ) const
{
	Assert( nAttributeIdx < MAX_SOA_FIELDS );
	Assert( nX < m_nColumns );
	Assert( nY < m_nRows );
	Assert( nZ < m_nSlices );
	Assert( m_nDataType[nAttributeIdx] != ATTRDATATYPE_NONE );
	Assert( m_nDataType[nAttributeIdx] != ATTRDATATYPE_4V );
	return reinterpret_cast<T *>( m_pAttributePtrs[nAttributeIdx]
								  + nX * m_nStrideInBytes[nAttributeIdx]
								  + nY * m_nRowStrideInBytes[nAttributeIdx]
								  + nZ * m_nSliceStrideInBytes[nAttributeIdx] );
}
FORCEINLINE FourVectors *CSOAContainer::ElementPointer4V( int nAttributeIdx, int nX, int nY, int nZ ) const
{
	Assert( nAttributeIdx < MAX_SOA_FIELDS );
	Assert( nX < m_nColumns );
	Assert( nY < m_nRows );
	Assert( nZ < m_nSlices );
	Assert( m_nDataType[nAttributeIdx] == ATTRDATATYPE_4V );
	int nXIdx = nX / 4;
	uint8 *pRet = m_pAttributePtrs[nAttributeIdx]
		+ nXIdx * 4 * m_nStrideInBytes[nAttributeIdx]
		+ nY * m_nRowStrideInBytes[nAttributeIdx]
		+ nZ * m_nSliceStrideInBytes[nAttributeIdx];
	pRet += 4 * ( nX & 3 );
	return reinterpret_cast<FourVectors *>( pRet );
}

FORCEINLINE size_t CSOAContainer::ItemByteStride( int nAttributeIdx ) const
{
	Assert( nAttributeIdx < MAX_SOA_FIELDS );
	Assert( m_nDataType[nAttributeIdx] != ATTRDATATYPE_NONE );
	return m_nStrideInBytes[nAttributeIdx];
}
// move all the data from one csoacontainer to another, leaving the source empty.
// this is just a pointer copy.
FORCEINLINE void CSOAContainer::MoveDataFrom( CSOAContainer other ) { (*this) = other; other.Init(); }
class CFltX4AttributeIterator : public CStridedConstPtr<fltx4>
{
public:
	FORCEINLINE CFltX4AttributeIterator( CSOAContainer const *pContainer, int nAttribute, int nRowNumber = 0 )
		: CStridedConstPtr<fltx4>( pContainer->ConstRowPtr( nAttribute, nRowNumber ),
								   pContainer->ItemByteStride( nAttribute ) )
	{
	}
};
class CFltX4AttributeWriteIterator : public CStridedPtr<fltx4>
{
public:
	FORCEINLINE CFltX4AttributeWriteIterator( CSOAContainer const *pContainer, int nAttribute, int nRowNumber = 0 )
		: CStridedPtr<fltx4>( pContainer->RowPtr<uint8>( nAttribute, nRowNumber ),
							  pContainer->ItemByteStride( nAttribute ) )
	{
	}
};
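// Usage sketch: reading one row of a (hypothetical) float attribute ATTR_MY_FIELD of container "img"
// with the read iterator above:
//
//   CFltX4AttributeIterator it( &img, ATTR_MY_FIELD, 0 );
//   for ( int i = 0; i < img.NumQuadsPerRow(); i++, ++it )
//   {
//       fltx4 fl4Group = *it;							// 4 consecutive floats from the row
//   }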
FORCEINLINE FourVectors CompressSIMD( FourVectors const &a, FourVectors const &b )
{
	FourVectors ret;
	ret.x = CompressSIMD( a.x, b.x );
	ret.y = CompressSIMD( a.y, b.y );
	ret.z = CompressSIMD( a.z, b.z );
	return ret;
}

FORCEINLINE FourVectors Compress4SIMD( FourVectors const &a, FourVectors const &b, FourVectors const &c, FourVectors const &d )
{
	FourVectors ret;
	ret.x = Compress4SIMD( a.x, b.x, c.x, d.x );
	ret.y = Compress4SIMD( a.y, b.y, c.y, d.y );
	ret.z = Compress4SIMD( a.z, b.z, c.z, d.z );
	return ret;
}
#endif // UTLSOACONTAINER_H