#include "platform.h"
#include "box_buoyancy.h"
#include "mathlib/vector4d.h"
#include "hardware_clock_fast.h"


// Copies lanes 0, 1 and 2 of a SIMD register into a scalar Vector; lane 3 is not read.
inline const Vector ToVector( const fltx4 & f4 )
{
	const float flX = SubFloat( f4, 0 );
	const float flY = SubFloat( f4, 1 );
	const float flZ = SubFloat( f4, 2 );
	return Vector( flX, flY, flZ );
}

#ifdef _X360
// Swizzle ( x, y, z, w ) -> ( y, x, z, w ).
FORCEINLINE fltx4 PermYXZW( const fltx4 & a )
{
	// selector 0x4B = 01 00 10 11b, i.e. lanes Y, X, Z, W
	const fltx4 f4Swizzled = __vpermwi( a, 0x4B );
	return f4Swizzled;
}
// Swizzle ( x, y, z, w ) -> ( x, z, y, w ).
FORCEINLINE fltx4 PermXZYW( const fltx4 & a )
{
	// selector 0x27 = 00 10 01 11b, i.e. lanes X, Z, Y, W
	const fltx4 f4Swizzled = __vpermwi( a, 0x27 );
	return f4Swizzled;
}
// Swizzle ( x, y, z, w ) -> ( z, y, x, w ).
FORCEINLINE fltx4 PermZYXW( const fltx4 & a )
{
	// selector 0x93 = 10 01 00 11b, i.e. lanes Z, Y, X, W
	const fltx4 f4Swizzled = __vpermwi( a, 0x93 );
	return f4Swizzled;
}
// Swizzle ( x, y, z, w ) -> ( x, x, y, w ).
FORCEINLINE fltx4 PermXXYW( const fltx4 & a )
{
	// selector 0x07 = 00 00 01 11b, i.e. lanes X, X, Y, W
	const fltx4 f4Swizzled = __vpermwi( a, 0x07 );
	return f4Swizzled;
}
// Swizzle ( x, y, z, w ) -> ( y, z, z, w ).
FORCEINLINE fltx4 PermYZZW( const fltx4 & a )
{
	// selector 0x6B = 01 10 10 11b, i.e. lanes Y, Z, Z, W
	const fltx4 f4Swizzled = __vpermwi( a, 0x6B );
	return f4Swizzled;
}
// Sum of the first three lanes of a, computed as a 3-component dot product with Four_Ones.
// NOTE(review): presumably the result is replicated to every lane, matching the other platforms - confirm.
FORCEINLINE fltx4 Sum3SIMD( const fltx4 &a )
{
	const fltx4 f4Sum = __vmsum3fp( a, Four_Ones );
	return f4Sum;
}
// Builds one register out of four by interleaving their high words in two merge stages:
// first x with z and y with w, then the two intermediate results with each other.
FORCEINLINE fltx4 CombineSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z, const fltx4 & w )
{
	const fltx4 f4MergedXZ = __vmrghw( x, z );
	const fltx4 f4MergedYW = __vmrghw( y, w );
	return __vmrghw( f4MergedXZ, f4MergedYW );
}

// Three-input variant of CombineSIMD: merges x with z, then merges that result with y.
// Assumes Y(xbox),Z(PC) are splatted
FORCEINLINE fltx4 CombineXYZ_Special( const fltx4 & x, const fltx4 & y, const fltx4 & z )
{
	const fltx4 f4MergedXZ = __vmrghw( x, z );
	const fltx4 f4Combined = __vmrghw( f4MergedXZ, y );
	return f4Combined;
}

#elif defined( _PS3 )

// Control vectors handed to vec_perm by the Perm* helpers below. Each 32-bit word lists the four
// source byte indices that form one output lane (0x00010203 = lane X, 0x04050607 = lane Y,
// 0x08090A0B = lane Z, 0x0C0D0E0F = lane W), so the word order spells out the swizzle in the name.
const int32 ALIGN16 g_SIMD_YXZW[4] ALIGN16_POST = { 0x04050607, 0x00010203, 0x08090A0B, 0x0C0D0E0F };
const int32 ALIGN16 g_SIMD_XZYW[4] ALIGN16_POST = { 0x00010203, 0x08090A0B, 0x04050607, 0x0C0D0E0F };
const int32 ALIGN16 g_SIMD_ZYXW[4] ALIGN16_POST = { 0x08090A0B, 0x04050607, 0x00010203, 0x0C0D0E0F };
const int32 ALIGN16 g_SIMD_XXYW[4] ALIGN16_POST = { 0x00010203, 0x00010203, 0x04050607, 0x0C0D0E0F };
const int32 ALIGN16 g_SIMD_YZZW[4] ALIGN16_POST = { 0x04050607, 0x08090A0B, 0x08090A0B, 0x0C0D0E0F };

// Swizzle ( x, y, z, w ) -> ( y, x, z, w ).
FORCEINLINE fltx4 PermYXZW( const fltx4 & a )
{
	const vec_uchar16 permControl = (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_YXZW );
	return vec_perm( a, a, permControl );
}
// Swizzle ( x, y, z, w ) -> ( x, z, y, w ).
FORCEINLINE fltx4 PermXZYW( const fltx4 & a )
{
	const vec_uchar16 permControl = (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_XZYW );
	return vec_perm( a, a, permControl );
}
// Swizzle ( x, y, z, w ) -> ( z, y, x, w ).
FORCEINLINE fltx4 PermZYXW( const fltx4 & a )
{
	const vec_uchar16 permControl = (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_ZYXW );
	return vec_perm( a, a, permControl );
}
// Swizzle ( x, y, z, w ) -> ( x, x, y, w ).
FORCEINLINE fltx4 PermXXYW( const fltx4 & a )
{
	const vec_uchar16 permControl = (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_XXYW );
	return vec_perm( a, a, permControl );
}
// Swizzle ( x, y, z, w ) -> ( y, z, z, w ).
FORCEINLINE fltx4 PermYZZW( const fltx4 & a )
{
	const vec_uchar16 permControl = (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_YZZW );
	return vec_perm( a, a, permControl );
}
// Adds the X, Y and Z lanes of a (W is ignored); the sum is built from splatted lanes,
// so every lane of the result carries the same value.
FORCEINLINE fltx4 Sum3SIMD( const fltx4 &a )
{
	const fltx4 f4AllX = SplatXSIMD( a );
	const fltx4 f4AllY = SplatYSIMD( a );
	const fltx4 f4AllZ = SplatZSIMD( a );
	return f4AllX + f4AllY + f4AllZ;
}

// Two-operand vec_perm control vectors. Byte indices 0x00..0x0F address the first operand (lanes X, Y, Z, W)
// and 0x10..0x1F address the second operand (lanes A, B, C, D).
// g_SIMD_XAXA selects ( first.x, second.x, first.x, second.x ).
const int32 ALIGN16 g_SIMD_XAXA[4] ALIGN16_POST = { 0x00010203, 0x10111213, 0x00010203, 0x10111213 };
// g_SIMD_XYAB selects ( first.x, first.y, second.x, second.y ); it must not duplicate the XAXA pattern.
const int32 ALIGN16 g_SIMD_XYAB[4] ALIGN16_POST = { 0x00010203, 0x04050607, 0x10111213, 0x14151617 };
// Builds one register out of four by interleaving their high words in two merge stages:
// first x with z and y with w, then the two intermediate results with each other.
FORCEINLINE fltx4 CombineSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z, const fltx4 & w )
{
	// alternative kept for reference (vec_perm based):
	//fltx4 xy = vec_perm(x, y, LoadAlignedIntSIMD( g_SIMD_XAXA ) );
	//fltx4 zw = vec_perm(z, w, LoadAlignedIntSIMD( g_SIMD_XAXA ) );
	const fltx4 f4MergedXZ = vec_mergeh( x, z );
	const fltx4 f4MergedYW = vec_mergeh( y, w );
	return vec_mergeh( f4MergedXZ, f4MergedYW );
}

// Three-input variant of CombineSIMD: merges x with z, then merges that result with y.
// Assumes Y(xbox),Z(PC) are splatted
FORCEINLINE fltx4 CombineXYZ_Special( const fltx4 & x, const fltx4 & y, const fltx4 & z )
{
	const fltx4 f4MergedXZ = vec_mergeh( x, z );
	const fltx4 f4Combined = vec_mergeh( f4MergedXZ, y );
	return f4Combined;
}
#else
// Swizzle ( x, y, z, w ) -> ( y, x, z, w ).
FORCEINLINE fltx4 PermYXZW( const fltx4 & a )
{
	// _MM_SHUFFLE lists source lanes from the highest output lane down to the lowest
	const fltx4 f4Swizzled = _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 2, 0, 1 ) );
	return f4Swizzled;
}
// Swizzle ( x, y, z, w ) -> ( x, z, y, w ).
FORCEINLINE fltx4 PermXZYW( const fltx4 & a )
{
	// _MM_SHUFFLE lists source lanes from the highest output lane down to the lowest
	const fltx4 f4Swizzled = _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 1, 2, 0 ) );
	return f4Swizzled;
}
// Swizzle ( x, y, z, w ) -> ( z, y, x, w ).
FORCEINLINE fltx4 PermZYXW( const fltx4 & a )
{
	// _MM_SHUFFLE lists source lanes from the highest output lane down to the lowest
	const fltx4 f4Swizzled = _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 0, 1, 2 ) );
	return f4Swizzled;
}
// Swizzle ( x, y, z, w ) -> ( x, x, y, w ).
FORCEINLINE fltx4 PermXXYW( const fltx4 & a )
{
	// _MM_SHUFFLE lists source lanes from the highest output lane down to the lowest
	const fltx4 f4Swizzled = _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 1, 0, 0 ) );
	return f4Swizzled;
}

// Swizzle ( x, y, z, w ) -> ( y, z, z, w ).
FORCEINLINE fltx4 PermYZZW( const fltx4 & a )
{
	// _MM_SHUFFLE lists source lanes from the highest output lane down to the lowest
	const fltx4 f4Swizzled = _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 2, 2, 1 ) );
	return f4Swizzled;
}
// Adds the X, Y and Z lanes of a (W is ignored); the sum is built from splatted lanes,
// so every lane of the result carries the same value.
FORCEINLINE fltx4 Sum3SIMD( const fltx4 &a )
{
	const fltx4 f4AllX = SplatXSIMD( a );
	const fltx4 f4AllY = SplatYSIMD( a );
	const fltx4 f4AllZ = SplatZSIMD( a );
	return f4AllX + f4AllY + f4AllZ;
}
// Gathers lane 0 of each input into one register: ( row0.x, row1.x, row2.x, row3.x ).
FORCEINLINE fltx4 CombineSIMD( const fltx4 & row0, const fltx4 & row1, const fltx4 & row2, const fltx4 & row3 )
{
	// 0x44 keeps lanes 0,1 of each operand: ( row0.x, row0.y, row1.x, row1.y ) and ( row2.x, row2.y, row3.x, row3.y )
	const fltx4 f4Pair01 = _mm_shuffle_ps( row0, row1, 0x44 );
	const fltx4 f4Pair23 = _mm_shuffle_ps( row2, row3, 0x44 );
	// 0x88 keeps lanes 0,2 of each operand, i.e. the four original X lanes
	return _mm_shuffle_ps( f4Pair01, f4Pair23, 0x88 );
}

// Three-input variant of CombineSIMD: yields ( x.x, y.x, z.x, z.z ), which is why z has to be splatted.
// Assumes Y(xbox),Z(PC) are splatted
FORCEINLINE fltx4 CombineXYZ_Special( const fltx4 & x, const fltx4 & y, const fltx4 & z )
{
	// 0x44 keeps lanes 0,1 of each operand: ( x.x, x.y, y.x, y.y )
	const fltx4 f4PairXY = _mm_shuffle_ps( x, y, 0x44 );
	// 0x88 keeps lanes 0,2 of each operand
	const fltx4 f4Combined = _mm_shuffle_ps( f4PairXY, z, 0x88 );
	return f4Combined;
}


#endif


// Convenience entry point: packs the four input vectors into a FourVectors via LoadAndSwizzle
// and forwards to GetBoxBuoyancy3x4, returning its result unchanged.
// NOTE(review): f4a/f4b/f4c are presumably the box edge vectors and f4Origin its center - confirm against callers.
fltx4 GetBoxBuoyancy4x3( const fltx4& f4a, const fltx4& f4b, const fltx4&f4c, const fltx4&f4Origin )
{
	FourVectors swizzledBox;
	swizzledBox.LoadAndSwizzle( f4a, f4b, f4c, f4Origin );

	const fltx4 f4Buoyancy = GetBoxBuoyancy3x4( swizzledBox );
	return f4Buoyancy;
}

void BenchmarkBoxBuoyancy4x3( const fltx4& f4a, const fltx4& f4b, const fltx4&f4c, const fltx4&f4Origin )
{
	FourVectors box;
	box.LoadAndSwizzle( f4a, f4b, f4c, f4Origin );
	fltx4 result = {0, 0, 0, 0};

	int start, end;
	const int nIterations = 1000000;

	start = GetHardwareClockFast();
	for ( int i = 0; i < nIterations; ++i )
	{
		result = result + GetBoxBuoyancy3x4( box );
		box.x = AndSIMD( box.x, box.x );
	}
	end = GetHardwareClockFast();
	Msg( "Box Buoyancy 4x3 Benchmark: %d ticks/box, volume %g \n", int32( ( end - start ) ) / nIterations, SubFloat( result, 3 ) / nIterations );
}


/*
inline fltx4 operator - ( const fltx4 & a, const fltx4 & b )
{
	return SubSIMD( a, b );
}

inline fltx4 operator + ( const fltx4 & a, const fltx4 & b )
{
	return AddSIMD( a, b );
}

inline fltx4 operator * ( const fltx4 & a, const fltx4 & b )
{
	return MulSIMD( a, b );
}
*/

// Per-lane clamp: raises a to at least low, then caps the result at high.
inline fltx4 Bound( const fltx4 & a, const fltx4 &low, const fltx4 &high )
{
	const fltx4 f4RaisedToLow = MaxSIMD( a, low );
	return MinSIMD( f4RaisedToLow, high );
}

// Per-lane clamp of a to the range [0, 1].
inline fltx4 Limit01( const fltx4 & a )
{
	const fltx4 f4NonNegative = MaxSIMD( Four_Zeros, a );
	return MinSIMD( f4NonNegative, Four_Ones );
}


// Splatted fractions (same value in all four lanes) used by the closed-form integrals below.
const fltx4 Four_One6th = { 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f };	// 1/6
const fltx4 Four_One4th = { 0.25f, 0.25f, 0.25f, 0.25f };	// 1/4
const fltx4 Four_One12th = { 1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f };	// 1/12

// integral A .. 1 of : y (tipZ + (baseZ - tipZ) y) dy
// The code below evaluates baseZ*Four_Thirds + tipZ/6 - alpha^2 * ( tipZ/2 - Four_Thirds * alpha (tipZ - baseZ) ),
// using MsubSIMD( a, b, c ) = c - a*b. NOTE(review): this matches the integral if Four_Thirds holds 1/3 - confirm.
inline fltx4 TriHelperIntegralAto1( const fltx4 &alpha, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 f4AlphaSqr = alpha * alpha;
	const fltx4 f4AlphaSlope = alpha * ( tipZ - baseZ );
	const fltx4 f4Inner = MsubSIMD( Four_Thirds, f4AlphaSlope, Four_PointFives * tipZ );
	const fltx4 f4Tail = MsubSIMD( f4AlphaSqr, f4Inner, Four_One6th * tipZ );
	return MaddSIMD( Four_Thirds, baseZ, f4Tail );
}


// integral A .. 1 of : y ((tipZ + (baseZ - tipZ) y)^2) dy
// The closed form is assembled from three terms, grouped by baseZ^2, baseZ*tipZ and tipZ^2.
inline fltx4 TriHelperZ2IntegralAto1( const fltx4 &alpha, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 a2 = alpha * alpha;
	const fltx4 am1 = alpha - Four_Ones;
	const fltx4 am1Sqr = am1 * am1;

	// (1 - alpha^4) / 4 * baseZ^2
	const fltx4 baseSqrTerm = Four_One4th * ( Four_Ones - a2 ) * ( a2 + Four_Ones ) * baseZ * baseZ;
	// (1/6 + alpha^3 (alpha/2 - 2/3)) * baseZ * tipZ
	const fltx4 mixedTerm = ( Four_One6th + a2 * alpha * ( Four_PointFives * alpha - Four_TwoThirds ) ) * baseZ * tipZ;
	// (alpha - 1)^3 (1/12 + alpha/4) * tipZ^2, subtracted from the other two
	const fltx4 tipSqrTerm = am1Sqr * am1 * ( Four_One12th + Four_One4th * alpha ) * tipZ * tipZ;

	return baseSqrTerm + mixedTerm - tipSqrTerm;
}

// integral A .. 1 of : y (tipZ + (baseZ - tipZ) y) * (tipX + (baseX - tipX) y) dy
// The closed form is split into the part proportional to tipX and the part proportional to baseX;
// their sum is scaled by 1/12.
inline fltx4 TriHelperZ2IntegralAto1( const fltx4 &alpha, const fltx4 &tipZ, const fltx4 &baseZ, const fltx4 &tipX, const fltx4 &baseX )
{
	const fltx4 a2 = alpha * alpha;
	const fltx4 am1 = alpha - Four_Ones;
	const fltx4 am1Sqr = am1 * am1;

	const fltx4 tipXTerm = am1Sqr*tipX*( ( Four_Ones + alpha*( Four_Twos + Four_Threes*alpha ) )*baseZ + tipZ + ( Four_Twos - Four_Threes*alpha )*alpha*tipZ );
	const fltx4 baseXTerm = baseX*( -Four_Threes*( a2*a2 - Four_Ones )*baseZ + tipZ + a2*alpha*( Four_Threes*alpha - Four_Fours )*tipZ );

	return ( tipXTerm + baseXTerm ) * Four_One12th;
}


// integral 0 .. B of : y (tipZ + (baseZ - tipZ) y) dy
// evaluated as beta^2 * ( Four_Thirds * (baseZ - tipZ) * beta + tipZ / 2 )
inline fltx4 TriHelperIntegral0toB( const fltx4 &beta, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 f4BetaSqr = beta * beta;
	const fltx4 f4Bracket = MaddSIMD( Four_Thirds, ( baseZ - tipZ ) * beta, Four_PointFives * tipZ );
	return f4BetaSqr * f4Bracket;
}


/*
double SubDbl( const fltx4& a, int i )
{
	return SubFloat( a, i );
}
*/

// integral 0 .. B of : y ((tipZ + (baseZ - tipZ) y)^2) dy
// evaluated as beta^2 * ( (beta dz)^2 / 4 + 2/3 (beta dz) tipZ + tipZ^2 / 2 ), with dz = baseZ - tipZ
inline fltx4 TriHelperZ2Integral0toB( const fltx4 &beta, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 f4DeltaZ = baseZ - tipZ;
	const fltx4 f4BetaDz = beta * f4DeltaZ;
	const fltx4 f4BetaSqr = beta * beta;

	const fltx4 f4QuadTerm = Four_One4th * f4BetaDz * f4BetaDz;
	const fltx4 f4CrossTerm = Four_TwoThirds * f4BetaDz * tipZ;
	const fltx4 f4TipTerm = Four_PointFives * tipZ * tipZ;
	/*
		double-precision reference evaluation, kept for validation:
		double testIntegral[4];
		for ( int i = 0; i < 4; ++i )
		{
			testIntegral[i] = SubDbl( beta, i ) * SubDbl( beta, i ) * ( SubDbl( Four_One4th, i ) * SubDbl( bdz, i ) * SubDbl( bdz, i ) + SubDbl( Four_TwoThirds, i ) * SubDbl( bdz, i ) * SubDbl( tipZ, i ) + SubDbl( Four_PointFives, i ) * SubDbl( tipZ, i ) * SubDbl( tipZ, i ) );
		}
	*/
	return f4BetaSqr * ( f4QuadTerm + f4CrossTerm + f4TipTerm );
}

// integral 0 .. B of : y (tipZ + (baseZ - tipZ) y) (tipX + (baseX - tipX) y ) dy
// note: baseX should be the center of the base coordinate
// evaluated as beta^2 * ( beta^2 dx dz / 4 + tipX tipZ / 2 + Four_Thirds * beta ( baseZ tipX + (baseX - 2 tipX) tipZ ) )
inline fltx4 TriHelperZ2Integral0toB( const fltx4 &beta, const fltx4 &tipZ, const fltx4 &baseZ, const fltx4 &tipX, const fltx4 &baseX )
{
	const fltx4 f4DeltaZ = baseZ - tipZ;
	const fltx4 f4DeltaX = baseX - tipX;
	const fltx4 f4BetaSqr = beta * beta;

	const fltx4 f4QuadTerm = f4BetaSqr * Four_One4th * f4DeltaX * f4DeltaZ;
	const fltx4 f4TipTerm = Four_PointFives * tipX * tipZ;
	const fltx4 f4LinearTerm = Four_Thirds * beta * ( baseZ * tipX + ( baseX - Four_Twos * tipX ) * tipZ );

	return f4BetaSqr * ( f4QuadTerm + f4TipTerm + f4LinearTerm );
}


// this is 3 * Integral 0..1 of (z0+(z1-z0)y)^2 dy, i.e. z0^2 + z0 z1 + z1^2
inline fltx4 TrplAvgSqrZ( const fltx4& z0, const fltx4 &z1 )
{
	const fltx4 f4SumZ = z0 + z1;
	const fltx4 f4SqrZ1 = z1 * z1;
	// z0 * (z0 + z1) + z1^2
	return MaddSIMD( z0, f4SumZ, f4SqrZ1 );
}

// this is 6 * Integral 0..1 of (z0+(z1-z0)y) (x0+(x1-x0)y) dy, i.e. x0 (2 z0 + z1) + x1 (2 z1 + z0)
inline fltx4 SixAvgSqrZX( const fltx4& z0, const fltx4 &z1, const fltx4& x0, const fltx4 &x1 )
{
	const fltx4 f4WeightX0 = MaddSIMD( Four_Twos, z0, z1 );	// 2 z0 + z1
	const fltx4 f4WeightX1 = MaddSIMD( Four_Twos, z1, z0 );	// 2 z1 + z0
	return x0 * f4WeightX0 + x1 * f4WeightX1;
}

// Splatted tolerance (1e-6 in every lane); GetBoxBuoyancy3x4 uses it to detect flat faces and negligible cut fractions.
const fltx4 f4Epsilon = {1e-6f, 1e-6f, 1e-6f, 1e-6f};


// Cross product a x b, evaluated for all four vector pairs at once.
// Each component is written as MsubSIMD( p, q, r ) = r - p*q.
inline FourVectors Cross( const FourVectors &a, const FourVectors &b )
{
	FourVectors vCross;
	vCross.x = MsubSIMD( a.z, b.y, MulSIMD( a.y, b.z ) );	// a.y b.z - a.z b.y
	vCross.y = MsubSIMD( a.x, b.z, MulSIMD( a.z, b.x ) );	// a.z b.x - a.x b.z
	vCross.z = MsubSIMD( a.y, b.x, MulSIMD( a.x, b.y ) );	// a.x b.y - a.y b.x
	return vCross;
}

// Z component only of the cross product a x b: a.x b.y - a.y b.x, for all four vector pairs at once.
inline fltx4 CrossZ( const FourVectors &a, const FourVectors &b )
{
	const fltx4 f4AxBy = MulSIMD( a.x, b.y );
	// MsubSIMD( p, q, r ) = r - p*q
	return MsubSIMD( a.y, b.x, f4AxBy );
}

// Per-lane square of a.
inline fltx4 Sqr( const fltx4 &a )
{
	const fltx4 f4Squared = a * a;
	return f4Squared;
}

// FourVectors overload of MsubSIMD: returns c - a*b, with the same b applied to the x, y and z components.
inline FourVectors MsubSIMD( const FourVectors &a, const fltx4 &b, const FourVectors &c) // c-a*b
{
	FourVectors vResult;
	vResult.x = MsubSIMD( a.x, b, c.x );
	vResult.y = MsubSIMD( a.y, b, c.y );
	vResult.z = MsubSIMD( a.z, b, c.z );
	return vResult;
}


// Per-lane scale constants; the name suffix spells the lane values ( 1, 1, half, 4 ) and ( 4, 4, 2, 4 ).
const fltx4 g_f4_11h4 = {1,1,0.5f,4.0f};
const fltx4 g_f4_4424 = {4,4,2,4};
// Stand-in for 1/(2 a.z) on faces that are flat in z (see rcp2AzSpecial in GetBoxBuoyancy3x4): large but finite.
const fltx4 g_f4AlmostInifiniteSlope = {1e+24,1e+24,1e+24,1e+24};
// Bit masks: sign bit only in lanes X, Y, Z; lane W is either all bits set (_W) or all bits clear (_NoW).
const int32 ALIGN16 g_SIMD_signmask_W[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0xFFFFFFFF };
const int32 ALIGN16 g_SIMD_signmask_NoW[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0 };

// Integrates the buoyancy of an oriented box against the water plane z = 0.
//
// physical interpretation: we're integrating the pressure force (pascals) along the submerged surface.
// in other words, we substitute the usual volume integral for surface integral
//
// box_in is in structure-of-arrays form: box_in.x / .y / .z hold the x / y / z coordinates of four vectors,
// one per lane. Lanes 0..2 are the box edge vectors a, b, c (corners are formed as center +/-a +/-b +/-c)
// and lane 3 (W) is the box center; box_in.z.w is the height of the center above the water level.
//
// Returns ( X, Y, 0, W ): W is the volume of displaced fluid, X and Y are the center of mass of the
// displaced fluid multiplied by W. The Z component is deliberately not computed (see the notes near the end).
//
// Xbox360: 1250 cycles; Core2 Quad: 500 cycles; Core i7: ? cycles   ; error: 2e-5
fltx4 GetBoxBuoyancy3x4( const FourVectors &box_in )
{
	FourVectors box; // sorted box
	// make (a,b,c).z > 0
	// (each vector whose z is negative is negated as a whole: x and y get the sign of z xor-ed in, z becomes |z|)
	fltx4 f4SignMask = LoadAlignedSIMD( g_SIMD_signmask );
	fltx4 signZ = AndSIMD( box_in.z, f4SignMask );
	box.x = XorSIMD( box_in.x, signZ );
	box.y = XorSIMD( box_in.y, signZ );
	box.z = AndNotSIMD( f4SignMask, box_in.z );
	fltx4 boxCenterZ = SplatWSIMD( box_in.z ); // the height of the center of the box above the water level
	// presumably ( center.x, center.y, 0, 0 ): the mask is expected to keep only the first two lanes - confirm against g_SIMD_SkipTailMask
	fltx4 boxCenterXY = AndSIMD( SetYSIMD( SplatWSIMD( box_in.x ), SplatWSIMD( box_in.y ) ), LoadAlignedSIMD( g_SIMD_SkipTailMask[2] ) );
	
	// there are a lot of scheduling holes on this stage, so we might as well precompute something
	// high point of the box, a+b+c
	fltx4 boxTopX = Sum3SIMD( box.x );
	fltx4 boxTopY = Sum3SIMD( box.y );
	fltx4 boxTopZrel = Sum3SIMD( box.z );
	fltx4 boxTopZabs = boxCenterZ + boxTopZrel, boxBotZ = boxCenterZ - boxTopZrel;

	// sort a.z > b.z > c.z > 0; sorting takes 43 cycles on xbox360
	bi32x4 swap_a_c = CmpLtSIMD( SplatXSIMD( box.z ), SplatZSIMD( box.z ) );
	box.x = MaskedAssign( swap_a_c, PermZYXW( box.x ), box.x );
	box.y = MaskedAssign( swap_a_c, PermZYXW( box.y ), box.y );
	box.z = MaskedAssign( swap_a_c, PermZYXW( box.z ), box.z );

	bi32x4 isBsmaller = CmpLtSIMD( SplatYSIMD( box.z ), box.z );
	bi32x4 ordered_a_b = SplatXSIMD( isBsmaller ); // if a > b, they're ordered correctly
	box.x = MaskedAssign( ordered_a_b, box.x, PermYXZW( box.x ) );
	box.y = MaskedAssign( ordered_a_b, box.y, PermYXZW( box.y ) );
	box.z = MaskedAssign( ordered_a_b, box.z, PermYXZW( box.z ) );

	bi32x4 swap_b_c = SplatZSIMD( isBsmaller ); // if b < c, we need to swap them
	box.x = MaskedAssign( swap_b_c, PermXZYW( box.x ), box.x );
	box.y = MaskedAssign( swap_b_c, PermXZYW( box.y ), box.y );
	box.z = MaskedAssign( swap_b_c, PermXZYW( box.z ), box.z );

	Assert( SubFloat( box.z, 0 ) >= SubFloat( box.z, 1 ) && SubFloat( box.z, 1 ) >= SubFloat( box.z, 2 ) && SubFloat( box.z, 2 ) >= 0 );

	// sorted and positive, time to integrate sides: (a,b) (a,c) (b,c)
	// (a-b).z > (b-a).z, so the a+b, a-b, b-a, -a-b is the order of corners, top-to-bottom
	FourVectors boxA, boxB; // these two represent a and b of each pair of edges defining a face: lanes hold (a,b), (a,c), (b,c)
	boxA.x = PermXXYW( box.x );
	boxA.y = PermXXYW( box.y );
	boxA.z = PermXXYW( box.z );
	boxB.x = PermYZZW( box.x );
	boxB.y = PermYZZW( box.y );
	boxB.z = PermYZZW( box.z );
	FourVectors boxC; // "c" maps to +/-c,b,a
	boxC.x = PermZYXW( box.x );
	boxC.y = PermZYXW( box.y );
	boxC.z = PermZYXW( box.z );

	// if a.z == 0 , b.z is also 0, so the whole rectangle is parallel to z=const
	bi32x4 isSideFlat = CmpLtSIMD( boxA.z, f4Epsilon );
	fltx4 rcpAz = AndNotSIMD( isSideFlat, ReciprocalSIMD( boxA.z ) );
	fltx4 rcp2Az = Four_PointFives * rcpAz;

	// the part of quad along a that's in the triangles cut by z=const surfaces
	// this is the same regardless of C
	//
	// tab size must = 4 spaces for the ASCII art below to make sense
	//
	//                     * (a+b)  cut = 0								a
	//				    /|									|			^
	//				   / |									|			|
	//		(a-b)     *--+        cut = f4CutPart			|			|
	//				  |	 |									|			|
	//				  |	 |									|			|     > b=
	//				  |	 |									V			|	 /
	//				  +--* (b-a)  cut = 1					cut,		|	/
	//				  | /									level,		|  /
	//				  |/									fraction,	| /
	//		(-a-b)    *			  cut = 1 + f4CutPart		etc.		|/
	//
	//
	//             (a+b)-(a-b)	  2b	b
	// computed as ----------- == -- == -
	//             (a+b)-(b-a)	  2a	a
	//
	fltx4 f4CutPart = MulSIMD( boxB.z, rcpAz );  // this must be between 0 (b is parallel to z=const) and 1 (a and b both have 45' slope)
	Assert( IsAllGreaterThanOrEq( Four_Ones + f4Epsilon, SetWToZeroSIMD( f4CutPart ) ) && IsAllGreaterThanOrEq( f4CutPart + f4Epsilon, SetWToZeroSIMD( Four_Zeros ) ) );
	//fltx4 rcpCutPart = AndSIMD( ReciprocalSIMD( f4CutPart ), CmpGtSIMD( f4CutPart, f4Epsilon ) );

	// integrate the full sides of the box, multiplied by the XY projection areas
	fltx4 f4SideProj = fabs( CrossZ( boxA, boxB ) );

	// here's the center-of-mass and total volume integral solution:
	//		{{4/3 (3 x0 z0 + xA zA + xB zB), 4/3 (3 y0 z0 + yA zA + yB zB), 2/3 (3 z0^2 + zA^2 + zB^2), 4 z0},
	//		{1/24 (4 x0 (3 z0 + zA + zB) + xA (4 z0 + 2 zA + zB) + xB (4 z0 + zA + 2 zB)), 
	// 		 1/24 (4 y0 (3 z0 + zA + zB) + yA (4 z0 + 2 zA + zB) + yB (4 z0 + zA + 2 zB)), 
	// 		 1/24 (6 z0^2 + zA^2 + zA zB + zB^2 + 4 z0 (zA + zB)), 
	// 		 1/6 (3 z0 + zA + zB)}}
	//fltx4 f4FullZ0_Cpos = boxCenterZ + boxC.z, f4FullZ0_Cneg = boxCenterZ - boxC.z;
	
	// 4/3 (3 x0 z0 + xA zA + xB zB) type of integral : take x0 z0 + (xA zA + xB zB) / 3
	// consider that x0 = +/- boxC.x and z0 = boxCenterZ +/- boxC.z, we're left with
	// +/- boxCenter boxC.x + boxC.x boxC.z + (xA zA + xB zB) / 3
	// Again, the only part that changes is (+/- boxCenterZ boxC.x)

	fltx4 f4Full_X_common = boxC.x * boxC.z + Four_Thirds * ( boxA.x * boxA.z + boxB.x * boxB.z );
	fltx4 f4Full_X_Cpos = Four_Fours * (boxCenterZ * boxC.x + f4Full_X_common); 
	fltx4 f4Full_X_Cneg = Four_Fours * (f4Full_X_common - boxCenterZ * boxC.x);  

	// y is the same as x

	fltx4 f4Full_Y_common = boxC.y * boxC.z + Four_Thirds * ( boxA.y * boxA.z + boxB.y * boxB.z );
	fltx4 f4Full_Y_Cpos = Four_Fours * ( boxCenterZ * boxC.y + f4Full_Y_common ) ;  
	fltx4 f4Full_Y_Cneg = Four_Fours * ( f4Full_Y_common - boxCenterZ * boxC.y ) ;  

	// z is different: 2/3 (3 z0^2 + zA^2 + zB^2) ;  z0 = boxCenterZ +/- boxC.z, 
	// so we can just add the difference of 4 * boxCenterZ * boxC.z to get from Cneg to Cpos
	fltx4 f4Full_Z_common = Four_TwoThirds * ( Sqr( boxA.z ) + Sqr( boxB.z ) );
	fltx4 f4Full_Z_Cpos = MaddSIMD( Four_Twos, Sqr( boxCenterZ + boxC.z ), f4Full_Z_common );
	fltx4 f4Full_Z_Cneg = MaddSIMD( Four_Twos, Sqr( boxCenterZ - boxC.z ), f4Full_Z_common );

	fltx4 f4Full_W_Cpos = Four_Fours * ( boxCenterZ + boxC.z ), f4Full_W_Cneg = Four_Fours * ( boxCenterZ - boxC.z );

	// this is how we'd  compute the center of mass for fully-submerged cube, for validation
#ifdef _DEBUG
	fltx4 f4TestVolume = Dot3SIMD( f4Full_W_Cpos - f4Full_W_Cneg, f4SideProj );
	fltx4 f4TestSideProjDivVolume = f4SideProj * ReciprocalSIMD( f4TestVolume );
	fltx4 f4TestLeverX = Dot3SIMD( f4Full_X_Cpos - f4Full_X_Cneg, f4TestSideProjDivVolume ), f4TestLeverY = Dot3SIMD( f4Full_Y_Cpos - f4Full_Y_Cneg, f4TestSideProjDivVolume );
	fltx4 f4TestLeverZ = Dot3SIMD( f4Full_Z_Cpos - f4Full_Z_Cneg, f4TestSideProjDivVolume );
	fltx4 f4TestResult = CombineSIMD( f4TestLeverX + SplatWSIMD(box_in.x), f4TestLeverY + SplatWSIMD(box_in.y), f4TestLeverZ, f4TestVolume ); (void)f4TestResult;
#endif

	//
	//
	/////////////////////////////////////////////////////////////////////////////
	//   Computing Center parallelogram component of the full surface integral
	//

	// To compute the integral across the submerged part of each of 6 faces, we'll compute these components and then selectively sum them up
	// to form the full integral: the top and bottom triangle.
	// if the water level is intersecting top triangle ((a-b).z < 0) , we'll subtract top triangle integral from full integral
	// if the water level is intersecting bottom triangle ((b-a).z < 0) , we'll select just the bottom triangle integral
	// .. and we'll have to compute the middle part because it's not symmetrical ..

	// .. on the second thought, we	compute the center (parallelogram) , upper tri and lower tri

	// for the center computation, we need the point of the middle of the center and m=b-ra parallel to the water
	// waterTop is{ 0 = at V0 top; cut = at V1; 1 = at V2; 1+cut = at V3 bottom of the quad }
	// waterBot is central-symmetrical, negative

	// to find the fraction of right side of rectangle (the +b side) that has z=0
	// this is different for +C and -C sides
	//
	//              (a+b) +/- c + p	   a+b +/- c + p
	// computed as ----------------  == --------------
	//			   (a+b)-(b-a)	         2 a
	// 
	// Warning: I take special care in cases of flat faces (z=const, when rcpAz is undefined)
	//          in these cases, submerged faces must have water<=0 and faces above water (z>0) must have water >= 1 + cut
	// Note:    If I take care not to compute fully-submerged or fully-above-water polytopes, I only need to check 
	//          below-water case for Cneg faces and above-water case for Cpos faces
	//     
	// The trick I'm using here to account for everything is perturb the face's slope slightly to effectively divide by epsilon

	fltx4 rcp2AzSpecial = MaskedAssign( isSideFlat, g_f4AlmostInifiniteSlope, rcp2Az );
	fltx4 f4WaterPart_Cpos = boxTopZabs * rcp2AzSpecial, f4WaterPart_Cneg = MaddSIMD( boxBotZ, rcp2AzSpecial, f4CutPart )  + Four_Ones;
	
	// on the central piece, we need to integrate along axes (a,m = b - cut*a) and ranges {-1+cut...max(-1+cut,1-max(w,cut)) , -1...1}
	// even cut and w have the same denominator: it's cut=2b/2a  and  water=topZ/2a

	//fltx4 f4HighLimit_Cpos = MaxSIMD( f4LowLimit, Four_Ones - MaxSIMD( f4WaterPart_Cpos, f4CutPart ) );
	//fltx4 f4HighLimit_Cneg = MaxSIMD( f4LowLimit, Four_Ones - MaxSIMD( f4WaterPart_Cneg, f4CutPart ) );
	fltx4 f4TopWaterInCenter_Cpos = MinSIMD( Four_Ones, MaxSIMD( f4CutPart, f4WaterPart_Cpos ) );
	fltx4 f4TopWaterInCenter_Cneg = MinSIMD( Four_Ones, MaxSIMD( f4CutPart, f4WaterPart_Cneg ) );
	// the range is full (1 means full span of the whole center parallelogram)
	// but the origin is to be multiplied by A, so 1 means half of the length (-1 means 0 area)
	fltx4 f4CenterRange_Cpos = Four_Ones - f4TopWaterInCenter_Cpos, f4CenterOriginA_Cpos = f4CutPart - f4TopWaterInCenter_Cpos;
	fltx4 f4CenterRange_Cneg = Four_Ones - f4TopWaterInCenter_Cneg, f4CenterOriginA_Cneg = f4CutPart - f4TopWaterInCenter_Cneg;
	// given the span (we're integrating from -span to +span), we can compute the center point for integration: ((r-1) + (1-max(w,r)))/2
	// we can also compute the area of projection, because we reduce the area of the face by 1-max(r,w), i.e. by the span
	fltx4 f4CenterProj_Cpos = f4SideProj * f4CenterRange_Cpos, f4CenterProj_Cneg = f4SideProj * f4CenterRange_Cneg;
	fltx4 f4CenterRangeSqr_Cpos = f4CenterRange_Cpos * f4CenterRange_Cpos;
	fltx4 f4CenterRangeSqr_Cneg = f4CenterRange_Cneg * f4CenterRange_Cneg;


	// to integrate the central piece, we need the center point (pos +/- (c-a*q)), q = ; and m=b-cut a
	// because it cancels out lots of terms in the integral

	FourVectors boxM = MsubSIMD( boxA, f4CutPart, boxB ); // m=b-ra, replacement for b in the integrals
	
		// here's the center-of-mass and total volume integral solution. M is our B in this case.
		//		{{4/3 (3 x0 z0 + xA zA + xM zM), 4/3 (3 y0 z0 + yA zA + yM zM), 2/3 (3 z0^2 + zA^2 + zM^2), 4 z0},
		//
		// and for triangles it would be this:
		//		{1/24 (4 x0 (3 z0 + zA + zM) + xA (4 z0 + 2 zA + zM) + xM (4 z0 + zA + 2 zM)), 
		// 		 1/24 (4 y0 (3 z0 + zA + zM) + yA (4 z0 + 2 zA + zM) + yM (4 z0 + zA + 2 zM)), 
		// 		 1/24 (6 z0^2 + zA^2 + zA zM + zM^2 + 4 z0 (zA + zM)), 
		// 		 1/6 (3 z0 + zA + zM)}}
		// ... but we only use the rectangular integral right now

		fltx4 f4CenterX0_Cpos =              boxC.x + f4CenterOriginA_Cpos * boxA.x, f4CenterX0_Cneg =              f4CenterOriginA_Cneg * boxA.x - boxC.x;
		fltx4 f4CenterY0_Cpos =              boxC.y + f4CenterOriginA_Cpos * boxA.y, f4CenterY0_Cneg =              f4CenterOriginA_Cneg * boxA.y - boxC.y;
		fltx4 f4CenterZ0_Cpos = boxCenterZ + boxC.z + f4CenterOriginA_Cpos * boxA.z, f4CenterZ0_Cneg = boxCenterZ + f4CenterOriginA_Cneg * boxA.z - boxC.z;

		// 4/3 (3 x0 z0 + xA zA + xB zB) type of integral : take x0 z0 + (xA zA + xB zB) / 3
		// xA zA + xB zB is the common part

		//fltx4 f4Center_X_common = Four_Thirds * (boxA.x * boxA.z + boxM.x * boxM.z );
		fltx4 boxMxz = boxM.x * boxM.z, boxAxz = boxA.x * boxA.z;
		fltx4 f4Center_X_Cpos = Four_Fours * MaddSIMD( f4CenterX0_Cpos, f4CenterZ0_Cpos, Four_Thirds * MaddSIMD( boxAxz, f4CenterRangeSqr_Cpos, boxMxz ) ); 
		fltx4 f4Center_X_Cneg = Four_Fours * MaddSIMD( f4CenterX0_Cneg, f4CenterZ0_Cneg, Four_Thirds * MaddSIMD( boxAxz, f4CenterRangeSqr_Cneg, boxMxz ) ); 

		// y is the same as x

		//fltx4 f4Center_Y_common = Four_Thirds * (boxA.y * boxA.z + boxM.y * boxM.z );
		fltx4 boxMyz = boxM.y * boxM.z, boxAyz  = boxA.y * boxA.z;
		fltx4 f4Center_Y_Cpos = Four_Fours * MaddSIMD( f4CenterY0_Cpos, f4CenterZ0_Cpos, Four_Thirds * MaddSIMD(boxAyz, f4CenterRangeSqr_Cpos, boxMyz ) ); 
		fltx4 f4Center_Y_Cneg = Four_Fours * MaddSIMD( f4CenterY0_Cneg, f4CenterZ0_Cneg, Four_Thirds * MaddSIMD(boxAyz, f4CenterRangeSqr_Cneg, boxMyz ) ); 

		// z is a bit different: 2/3 (3 z0^2 + zA^2 + zB^2) 
		// so we can just add the difference of 4 * boxCenterZ * boxC.z to get from Cneg to Cpos
		//fltx4 f4Center_Z_common = Four_TwoThirds * ( Sqr( boxA.z ) + Sqr( boxM.z ) );
		fltx4 boxMzz = boxM.z * boxM.z, boxAzz  = boxA.z * boxA.z;
		fltx4 f4Center_Z_Cpos = Four_Twos * MaddSIMD( f4CenterZ0_Cpos, f4CenterZ0_Cpos, Four_Thirds * MaddSIMD( boxAzz, f4CenterRangeSqr_Cpos, boxMzz ) );
		fltx4 f4Center_Z_Cneg = Four_Twos * MaddSIMD( f4CenterZ0_Cneg, f4CenterZ0_Cneg, Four_Thirds * MaddSIMD( boxAzz, f4CenterRangeSqr_Cneg, boxMzz ) );

		fltx4 f4Center_W_Cpos = Four_Fours * f4CenterZ0_Cpos, f4Center_W_Cneg = Four_Fours * f4CenterZ0_Cneg;

#ifdef _DEBUG
		fltx4 f4CenterVolume = Dot3SIMD( f4Center_W_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_W_Cneg, f4CenterProj_Cneg );
		fltx4 f4CenterLeverX = Dot3SIMD( f4Center_X_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_X_Cneg, f4CenterProj_Cneg );
		fltx4 f4CenterLeverY = Dot3SIMD( f4Center_Y_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_Y_Cneg, f4CenterProj_Cneg );
		fltx4 f4CenterLeverZ = Dot3SIMD( f4Center_Z_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_Z_Cneg, f4CenterProj_Cneg );
		
		// this is the condensed result of previous integration
		fltx4 f4CenterComponent = CombineSIMD( f4CenterLeverX, f4CenterLeverY, f4CenterLeverZ, f4CenterVolume );(void)f4CenterComponent;
#endif
	//
	//
	//////////////////////////////////////////////////////////////////////////
	//	  Computing triangle components
	//
	//    If top triangle is selected , Center and bottom tri are ignored and top tri is subtracted from "Full" side integrals

	// top triangle starts with the top vertex, spanning 0..-2*min(water,cut) along A and 0..-2*min(water,cut)/cut along B
	// the isTopTri_* selectors will select the top tris out if appropriate
	bi32x4 isCutLarge = CmpGtSIMD( f4CutPart, f4Epsilon ); // is the triangle part large enough to even consider it? in most cases it is
	bi32x4 isTopTri_Cpos = AndSIMD( CmpLeSIMD( f4WaterPart_Cpos, f4CutPart ), isCutLarge ), isTopTri_Cneg = AndSIMD( CmpLeSIMD( f4WaterPart_Cneg, f4CutPart ), isCutLarge );
	//fltx4 isBotTri_Cpos = AndNotSIMD( isTopTri_Cpos, isCutLarge ), isBotTri_Cneg = AndNotSIMD( isTopTri_Cneg, isCutLarge );
	// integrate above-water part
	fltx4 rcpCutPart = AndSIMD( ReciprocalSIMD( f4CutPart ), isCutLarge ); // when this is Inf, isCutLarge will select it off

	fltx4 f4WaterInTop_Cpos = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterPart_Cpos ) );
	fltx4 f4WaterInTop_Cneg = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterPart_Cneg ) ); // when water is below the tri, it'll actually be selected off, so the min(cut,water) isn't needed here really
	FourVectors boxTopTriB_Cpos = boxB * ( f4WaterInTop_Cpos * rcpCutPart ), boxTopTriB_Cneg = boxB * ( f4WaterInTop_Cneg * rcpCutPart );
	FourVectors boxTopTriA_Cpos = boxA * f4WaterInTop_Cpos, boxTopTriA_Cneg = boxA * f4WaterInTop_Cneg;
	fltx4 f4TopTriProj_Cpos = fabs( CrossZ( boxTopTriA_Cpos, boxTopTriB_Cpos ) ), f4TopTriProj_Cneg = fabs( CrossZ( boxTopTriA_Cneg, boxTopTriB_Cneg ) );

	fltx4 f4WaterInBot_common = Four_Ones + f4CutPart, f4CutPart_neg = -f4CutPart;
// 	fltx4 f4WaterInBot_Cpos = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterInBot_common - f4WaterPart_Cpos ) );
// 	fltx4 f4WaterInBot_Cneg = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterInBot_common - f4WaterPart_Cneg ) );
	fltx4 f4WaterInBot_Cpos_neg = MinSIMD( Four_Zeros, MaxSIMD( f4CutPart_neg, f4WaterPart_Cpos - f4WaterInBot_common) );
	fltx4 f4WaterInBot_Cneg_neg = MinSIMD( Four_Zeros, MaxSIMD( f4CutPart_neg, f4WaterPart_Cneg - f4WaterInBot_common) );

	// update: (looks like) for the bottom triangle, we need to integrate (0..+2) and (0..+2) in positive triangle, so we'll just need to flip
	//         the signs for the bottom triangle A and B vectors
	FourVectors boxBotTriB_Cpos = boxB * ( f4WaterInBot_Cpos_neg * rcpCutPart ), boxBotTriB_Cneg = boxB * ( f4WaterInBot_Cneg_neg * rcpCutPart );
	FourVectors boxBotTriA_Cpos = boxA * f4WaterInBot_Cpos_neg, boxBotTriA_Cneg = boxA * f4WaterInBot_Cneg_neg;
	fltx4 f4BotTriProj_Cpos = fabs( CrossZ( boxBotTriA_Cpos, boxBotTriB_Cpos ) ), f4BotTriProj_Cneg = fabs( CrossZ( boxBotTriA_Cneg, boxBotTriB_Cneg ) );

	// let's integrate along topTriA (0..-2) and topTriB (0..-2), a triangle . Here's the solved integral:
	// 	2/3 (xA (-2 z0 + 2 zA + zB) + xB (-2 z0 + zA + 2 zB) + x0 (3 z0 - 2 (zA + zB))), 
	// 	2/3 (yA (-2 z0 + 2 zA + zB) + yB (-2 z0 + zA + 2 zB) + y0 (3 z0 - 2 (zA + zB))), 
	// 	1/3 (3 z0^2 - 4 z0 (zA + zB) + 2 (zA^2 + zA zB + zB^2)), 
	// 	2/3 (3 z0 - 2 (zA + zB))
	//
	// here's collected by x0,y0,z0
	// 	2/3 (-2 xA - 2 xB) z0 + 2/3 (2 xA zA + xB zA + xA zB + 2 xB zB) + x0 (2 z0 - (4 (zA + zB))/3), 
	// 	2/3 (-2 yA - 2 yB) z0 + 2/3 (2 yA zA + yB zA + yA zB + 2 yB zB) + y0 (2 z0 - (4 (zA + zB))/3),
	// 	z0^2 - 4/3 z0 (zA + zB) + 2/3 (zA^2 + zA zB + zB^2),
	// 	2 z0 - (4 (zA + zB))/3

	// x0,y0,z0 are the boxTopZ for Cpos, and boxTopZ - 2 C for Cneg
	fltx4 f4TopTriX0_Cneg = MsubSIMD( Four_Twos, boxC.x, boxTopX );
	fltx4 f4TopTriY0_Cneg = MsubSIMD( Four_Twos, boxC.y, boxTopY );
	fltx4 f4TopTriZ0_Cneg = MsubSIMD( Four_Twos, boxC.z, boxTopZabs );

	fltx4 f4TopTri_X_Cpos = Four_TwoThirds * (boxTopTriA_Cpos.x * ( Four_Twos * ( boxTopTriA_Cpos.z - boxTopZabs ) + boxTopTriB_Cpos.z ) + 
		boxTopTriB_Cpos.x * ( boxTopTriA_Cpos.z + 
		Four_Twos * ( boxTopTriB_Cpos.z - boxTopZabs ) ) + 
		boxTopX * (Four_Threes * boxTopZabs - Four_Twos * ( boxTopTriA_Cpos.z + boxTopTriB_Cpos.z ) ) );

	fltx4 f4TopTri_Y_Cpos = Four_TwoThirds * (boxTopTriA_Cpos.y * ( Four_Twos * ( boxTopTriA_Cpos.z - boxTopZabs ) + boxTopTriB_Cpos.z ) + 
		boxTopTriB_Cpos.y * ( boxTopTriA_Cpos.z + 
		Four_Twos * ( boxTopTriB_Cpos.z - boxTopZabs ) ) + 
		boxTopY * (Four_Threes * boxTopZabs - Four_Twos * ( boxTopTriA_Cpos.z + boxTopTriB_Cpos.z ) ) );

	fltx4 f4TopTri_Z_Cpos = Four_Thirds * (Four_Threes * boxTopZabs * boxTopZabs - 
		Four_Fours * boxTopZabs * (boxTopTriA_Cpos.z + boxTopTriB_Cpos.z) + 
		Four_Twos * (boxTopTriA_Cpos.z * boxTopTriA_Cpos.z + 
		boxTopTriA_Cpos.z * boxTopTriB_Cpos.z + boxTopTriB_Cpos.z*boxTopTriB_Cpos.z));

	fltx4 f4TopTri_W_Cpos = Four_TwoThirds * ( Four_Threes * boxTopZabs - Four_Twos * ( boxTopTriA_Cpos.z + boxTopTriB_Cpos.z ) );

	fltx4 f4TopTri_X_Cneg = Four_TwoThirds * (boxTopTriA_Cneg.x * ( Four_Twos * ( boxTopTriA_Cneg.z - f4TopTriZ0_Cneg ) + boxTopTriB_Cneg.z ) + 
		boxTopTriB_Cneg.x * ( boxTopTriA_Cneg.z + 
		Four_Twos * ( boxTopTriB_Cneg.z - f4TopTriZ0_Cneg ) ) + 
		f4TopTriX0_Cneg * (Four_Threes * f4TopTriZ0_Cneg - Four_Twos * ( boxTopTriA_Cneg.z + boxTopTriB_Cneg.z ) ) );

	fltx4 f4TopTri_Y_Cneg = Four_TwoThirds * (boxTopTriA_Cneg.y * ( Four_Twos * ( boxTopTriA_Cneg.z - f4TopTriZ0_Cneg ) + boxTopTriB_Cneg.z ) + 
		boxTopTriB_Cneg.y * ( boxTopTriA_Cneg.z + 
		Four_Twos * ( boxTopTriB_Cneg.z - f4TopTriZ0_Cneg ) ) + 
		f4TopTriY0_Cneg * (Four_Threes * f4TopTriZ0_Cneg - Four_Twos * ( boxTopTriA_Cneg.z + boxTopTriB_Cneg.z ) ) );

	fltx4 f4TopTri_Z_Cneg = Four_Thirds * (Four_Threes * f4TopTriZ0_Cneg * f4TopTriZ0_Cneg - 
		Four_Fours * f4TopTriZ0_Cneg * (boxTopTriA_Cneg.z + boxTopTriB_Cneg.z) + 
		Four_Twos * (boxTopTriA_Cneg.z * boxTopTriA_Cneg.z + 
		boxTopTriA_Cneg.z * boxTopTriB_Cneg.z + boxTopTriB_Cneg.z*boxTopTriB_Cneg.z));

	fltx4 f4TopTri_W_Cneg = Four_TwoThirds * ( Four_Threes * f4TopTriZ0_Cneg - Four_Twos * ( boxTopTriA_Cneg.z + boxTopTriB_Cneg.z ) );
	
	
	// bottom triangle of the +C faces: the reference corner is c - a - b (plus the center height for z)
	fltx4 f4BotTriX0_Cpos = boxC.x - boxA.x - boxB.x;
	fltx4 f4BotTriY0_Cpos = boxC.y - boxA.y - boxB.y;
	fltx4 f4BotTriZ0_Cpos = boxC.z - boxA.z - boxB.z + boxCenterZ;

	fltx4 f4BotTri_X_Cpos = Four_TwoThirds * (boxBotTriA_Cpos.x * ( Four_Twos * ( boxBotTriA_Cpos.z - f4BotTriZ0_Cpos ) + boxBotTriB_Cpos.z ) + 
		boxBotTriB_Cpos.x * ( boxBotTriA_Cpos.z + 
		Four_Twos * ( boxBotTriB_Cpos.z - f4BotTriZ0_Cpos ) ) + 
		f4BotTriX0_Cpos * (Four_Threes * f4BotTriZ0_Cpos - Four_Twos * ( boxBotTriA_Cpos.z + boxBotTriB_Cpos.z ) ) );

	fltx4 f4BotTri_Y_Cpos = Four_TwoThirds * (boxBotTriA_Cpos.y * ( Four_Twos * ( boxBotTriA_Cpos.z - f4BotTriZ0_Cpos ) + boxBotTriB_Cpos.z ) + 
		boxBotTriB_Cpos.y * ( boxBotTriA_Cpos.z + 
		Four_Twos * ( boxBotTriB_Cpos.z - f4BotTriZ0_Cpos ) ) + 
		f4BotTriY0_Cpos * (Four_Threes * f4BotTriZ0_Cpos - Four_Twos * ( boxBotTriA_Cpos.z + boxBotTriB_Cpos.z ) ) );

	fltx4 f4BotTri_Z_Cpos = Four_Thirds * (Four_Threes * f4BotTriZ0_Cpos * f4BotTriZ0_Cpos - 
		Four_Fours * f4BotTriZ0_Cpos * (boxBotTriA_Cpos.z + boxBotTriB_Cpos.z) + 
		Four_Twos * (boxBotTriA_Cpos.z * boxBotTriA_Cpos.z + 
		boxBotTriA_Cpos.z * boxBotTriB_Cpos.z + boxBotTriB_Cpos.z*boxBotTriB_Cpos.z));

	fltx4 f4BotTri_W_Cpos = Four_TwoThirds * ( Four_Threes * f4BotTriZ0_Cpos - Four_Twos * ( boxBotTriA_Cpos.z + boxBotTriB_Cpos.z ) );

	// bottom triangle of the -C faces: the reference corner is -(a+b+c), hence the negated boxTopX / boxTopY below
	fltx4 f4BotTriZ0_Cneg = boxCenterZ - boxTopZrel;
	fltx4 f4BotTri_X_Cneg = Four_TwoThirds * (boxBotTriA_Cneg.x * ( Four_Twos * ( boxBotTriA_Cneg.z - f4BotTriZ0_Cneg ) + boxBotTriB_Cneg.z ) + 
		boxBotTriB_Cneg.x * ( boxBotTriA_Cneg.z + 
		Four_Twos * ( boxBotTriB_Cneg.z - f4BotTriZ0_Cneg ) ) 
		-boxTopX * (Four_Threes * f4BotTriZ0_Cneg - Four_Twos * ( boxBotTriA_Cneg.z + boxBotTriB_Cneg.z ) ) );

	fltx4 f4BotTri_Y_Cneg = Four_TwoThirds * (boxBotTriA_Cneg.y * ( Four_Twos * ( boxBotTriA_Cneg.z - f4BotTriZ0_Cneg ) + boxBotTriB_Cneg.z ) + 
		boxBotTriB_Cneg.y * ( boxBotTriA_Cneg.z + 
		Four_Twos * ( boxBotTriB_Cneg.z - f4BotTriZ0_Cneg ) ) 
		-boxTopY * (Four_Threes * f4BotTriZ0_Cneg - Four_Twos * ( boxBotTriA_Cneg.z + boxBotTriB_Cneg.z ) ) );

	fltx4 f4BotTri_Z_Cneg = Four_Thirds * (Four_Threes * f4BotTriZ0_Cneg * f4BotTriZ0_Cneg - 
		Four_Fours * f4BotTriZ0_Cneg * (boxBotTriA_Cneg.z + boxBotTriB_Cneg.z) + 
		Four_Twos * (boxBotTriA_Cneg.z * boxBotTriA_Cneg.z + 
		boxBotTriA_Cneg.z * boxBotTriB_Cneg.z + boxBotTriB_Cneg.z*boxBotTriB_Cneg.z));

	fltx4 f4BotTri_W_Cneg = Four_TwoThirds * ( Four_Threes * f4BotTriZ0_Cneg - Four_Twos * ( boxBotTriA_Cneg.z + boxBotTriB_Cneg.z ) );

	// per face, pick one of two ways to assemble the submerged integral:
	// water line inside the top triangle -> full face minus the (above-water) top triangle; otherwise -> bottom triangle plus center parallelogram
	fltx4 f4All_X_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_X_Cpos - f4TopTriProj_Cpos * f4TopTri_X_Cpos, f4BotTriProj_Cpos * f4BotTri_X_Cpos + f4CenterProj_Cpos * f4Center_X_Cpos );
	fltx4 f4All_X_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_X_Cneg - f4TopTriProj_Cneg * f4TopTri_X_Cneg, f4BotTriProj_Cneg * f4BotTri_X_Cneg + f4CenterProj_Cneg * f4Center_X_Cneg );
	fltx4 f4All_Y_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_Y_Cpos - f4TopTriProj_Cpos * f4TopTri_Y_Cpos, f4BotTriProj_Cpos * f4BotTri_Y_Cpos + f4CenterProj_Cpos * f4Center_Y_Cpos );
	fltx4 f4All_Y_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_Y_Cneg - f4TopTriProj_Cneg * f4TopTri_Y_Cneg, f4BotTriProj_Cneg * f4BotTri_Y_Cneg + f4CenterProj_Cneg * f4Center_Y_Cneg );
	fltx4 f4All_Z_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_Z_Cpos - f4TopTriProj_Cpos * f4TopTri_Z_Cpos, f4BotTriProj_Cpos * f4BotTri_Z_Cpos + f4CenterProj_Cpos * f4Center_Z_Cpos );
	fltx4 f4All_Z_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_Z_Cneg - f4TopTriProj_Cneg * f4TopTri_Z_Cneg, f4BotTriProj_Cneg * f4BotTri_Z_Cneg + f4CenterProj_Cneg * f4Center_Z_Cneg );
	fltx4 f4All_W_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_W_Cpos - f4TopTriProj_Cpos * f4TopTri_W_Cpos, f4BotTriProj_Cpos * f4BotTri_W_Cpos + f4CenterProj_Cpos * f4Center_W_Cpos );
	fltx4 f4All_W_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_W_Cneg - f4TopTriProj_Cneg * f4TopTri_W_Cneg, f4BotTriProj_Cneg * f4BotTri_W_Cneg + f4CenterProj_Cneg * f4Center_W_Cneg );

	// sum the (+C minus -C) contributions of the three face pairs held in lanes 0..2
	fltx4 f4All_X = Sum3SIMD( f4All_X_Cpos - f4All_X_Cneg );
	fltx4 f4All_Y = Sum3SIMD( f4All_Y_Cpos - f4All_Y_Cneg );

	// <Sergiy> to be brutally honest, I don't care about Z integral. It represents the Z of the lever of archimedes force, and
	// it affects neither force nor torque exerted by the said force. Not computing it here reduces this routine from 1188 ticks to 900 ticks per run
	(void)f4All_Z_Cpos;
	(void)f4All_Z_Cneg;
	fltx4 f4All_Z = Four_Zeros;//Sum3SIMD( f4All_Z_Cpos - f4All_Z_Cneg ); 
	
	fltx4 f4All_W = Sum3SIMD( f4All_W_Cpos - f4All_W_Cneg );
#if 1
	// <Sergiy> again, to be brutally honest, I don't care about the actual lever of archimedes force.
	// I can just as well use lever * displaced_volume to compute the torque, and it'll actually be more precise, although less understandable.
	// 

	// this variant returns XYZ of the center of mass of displaced fluid multiplied by W, and W = volume of displaced fluid
	fltx4 f4All = CombineSIMD( f4All_X, f4All_Y, f4All_Z, f4All_W ) + f4All_W * boxCenterXY;
#else
	// this variant returns XYZ of the center of mass of displaced fluid, and W = volume of displaced fluid
	fltx4 rcpAllW = ReciprocalSIMD( f4All_W );
	fltx4 f4All = SetWSIMD( CombineXYZ_Special( f4All_X, f4All_Y, f4All_Z ) * rcpAllW + boxCenterXY, f4All_W );
#endif
	return f4All;
}


/*
float GetBoxBuoyancyTest( const matrix3x4_t & tm )
{

}
*/

// Scalar reference: contribution of a single box face to the displaced-fluid integral.
//
// The face is the quad centered at (pos + n) with corners (pos + n) +/- a +/- b.
// It is clipped against the plane z = 0, only the part with z <= 0 is kept, and
// that polygon is fanned into tetrahedra whose fourth vertex is the origin.
//
// Returns (xyz, w) where w is the summed tetrahedron volume and xyz is the sum of
// (tetrahedron centroid * tetrahedron volume). Both are multiplied by -1 when the
// face center lies on the negative side of CrossProduct( a, b ), +1 otherwise.
Vector4D GetPyramidBuoyancy( const Vector &pos, const Vector &a, const Vector &b, const Vector &n )
{
	const uint numCorners = 4;
	Vector corners[5], clipped[10];
	uint numClipped = 0;

	const Vector faceCenter = pos + n;
	corners[0] = faceCenter + a + b;
	corners[1] = faceCenter + a - b;
	corners[2] = faceCenter - a - b;
	corners[3] = faceCenter - a + b;

	// Walk the closed loop of edges (3->0, 0->1, 1->2, 2->3). Whenever an edge changes
	// the sign of z, emit the z = 0 crossing point before the edge's end corner.
	Vector prev = corners[numCorners - 1];
	for ( uint i = 0; i < numCorners; ++i )
	{
		const Vector &cur = corners[i];
		const bool bCrossesSurface = ( prev.z * cur.z < 0 );
		if ( bCrossesSurface )
		{
			const float t = prev.z / ( prev.z - cur.z );
			Vector &hit = clipped[numClipped++];
			hit = prev * ( 1 - t ) + cur * t;
			Assert( fabs( hit.z ) < 1e-5f );
			hit.z = 0; // snap the crossing exactly onto the plane
		}
		clipped[numClipped++] = cur;
		prev = cur;
	}

	// Orientation of the face relative to the origin decides the sign of its contribution.
	const Vector normal = CrossProduct( a, b );
	Assert( DotProduct( normal, n ) >= -1e-6f );
	const float flSign = ( DotProduct( faceCenter, normal ) < 0 ) ? -1.0f : 1.0f;

	// Drop every vertex with z > 0, keeping the survivors in their original order.
	uint numKept = 0;
	for ( uint i = 0; i < numClipped; ++i )
	{
		if ( clipped[i].z > 0 )
		{
			continue;
		}
		if ( numKept != i )
		{
			clipped[numKept] = clipped[i];
		}
		++numKept;
	}

	// Triangle fan around the first surviving vertex; each triangle plus the origin is one tetrahedron.
	float flVolume = 0;
	Vector vecMoment( 0, 0, 0 );
	const Vector &fanRoot = clipped[0];
	for ( uint k = 2; k < numKept; ++k )
	{
		const Vector &fanCur = clipped[k - 1];
		const Vector &fanNext = clipped[k];

		// every vertex of this triangle has z <= 0
		const float flElementVolume = fabs( DotProduct( CrossProduct( fanCur, fanRoot ), fanNext ) / 6 );
		flVolume += flElementVolume;

		// centroid of the tetrahedron (the fourth vertex is the origin, hence the 1/4)
		const Vector vecElementCenter = ( fanRoot + fanCur + fanNext ) * 0.25f;
		vecMoment += flElementVolume * vecElementCenter;
	}

	Vector4D result;
#if 1
	// xyz = volume-weighted centroid sum, w = volume
	result.Init( vecMoment * flSign, flVolume * flSign );
#else
	// xyz = centroid, w = volume
	result.Init( flVolume > 1e-8f ? vecMoment / flVolume : Vector( 0, 0, 0 ), flVolume * flSign );
#endif
	return result;
}

/*Vector4D GetQuadBuoyancy( const Vector &pos, const Vector &a, const Vector &b, const Vector &n )
{
	Vector verts[4], verts2[10];
	uint numVerts = 4, numVerts2 = 0;

	Vector acrossb = CrossProduct( a, b );

	float flAreaXIntegral = acrossb.x * 4 * ( pos.x + n.x );
	float flAreaYIntegral = acrossb.y * 4 * ( pos.y + n.y );
	float flAreaZIntegral = acrossb.z * 4 * ( pos.z + n.z );
	
	Vector4D vecIntegral;
	vecIntegral.w = flAreaZIntegral;
	Vector center = pos + n;
	Assert(DotProduct(n, acrossb) > 0);

	float x0 = center.x, y0 = center.y, z0 = center.z;
	float xA = a.x, yA = a.y, zA = a.z;
	float xB = b.x, yB = b.y, zB = b.z;

	vecIntegral.Init(
		4* x0 *z0 + (xA* zA + xB*zB)/3,
		4* y0 *z0 + (yA* zA + yB*zB)/3,
		2* z0 *z0 + (zA* zA + zB*zB)/3,
		4* z0);

	return vecIntegral * acrossb.z;
}  */


// Exchanges the contents of two vectors through a temporary copy.
inline void Swap(Vector&a, Vector&b)
{
	const Vector vecOldB = b;
	b = a;
	a = vecOldB;
}

/*
Vector4D GetBuoyancy( const Vector &pos, Vector box[3] )
{
	float rcpZ[3];
	for(int i = 0; i < 3; ++i)
	{
		if( box[i].z < 0 )
			box[i] = -box[i];
		for(int j = 0; j < i; ++j)
		{
			if(box[j].z < box[i].z)
				Swap(box[i], box[j]);
		}
	}
	for(int i = 0; i < 3; ++i)
		rcpZ[i] = box[i].z > 1e-7f? 1 / box[i].z : 0;


	uint numVerts = 4, numVerts2 = 0;

	Vector acrossb = CrossProduct( a, b );

	float flAreaXIntegral = acrossb.x * 4 * ( pos.x + n.x );
	float flAreaYIntegral = acrossb.y * 4 * ( pos.y + n.y );
	float flAreaZIntegral = acrossb.z * 4 * ( pos.z + n.z );

	Vector4D vecIntegral;
	vecIntegral.w = flAreaZIntegral;
	Vector center = pos + n;
	Assert(DotProduct(n, acrossb) > 0);

	float x0 = center.x, y0 = center.y, z0 = center.z;
	float xA = a.x, yA = a.y, zA = a.z;
	float xB = b.x, yB = b.y, zB = b.z;

	vecIntegral.Init(
		4* x0 *z0 + (xA* zA + xB*zB)/3,
		4* y0 *z0 + (yA* zA + yB*zB)/3,
		2* z0 *z0 + (zA* zA + zB*zB)/3,
		4* z0);

	return vecIntegral * acrossb.z;
}*/


// Weighted merge of two (xyz, w) samples: the resulting w is a.w + b.w and the
// resulting xyz is the w-weighted average of the two xyz parts. When the combined
// weight is within 1e-6 of zero the xyz part is set to (0,0,0) instead of dividing.
Vector4D operator % ( const Vector4D & a, const Vector4D & b )
{
	const float flTotalWeight = a.w + b.w;

	Vector vecBlend( 0, 0, 0 );
	if ( fabs( flTotalWeight ) > 1e-6f )
	{
		vecBlend = ( a.AsVector3D() * a.w + b.AsVector3D() * b.w ) / flTotalWeight;
	}

	Vector4D ave;
	ave.Init( vecBlend, flTotalWeight );
	return ave;
}


// Scalar reference for a whole box: adds up GetPyramidBuoyancy over the six faces
// (offsets +c, -c, +b, -b, +a, -a from pos). The operand order within each call is
// chosen per face; the terms are summed strictly left to right.
Vector4D GetBoxBuoyancy( const Vector& a, const Vector& b, const Vector& c, const Vector& pos )
{
	// faces at pos + c and pos - c
	const Vector4D sumC = GetPyramidBuoyancy( pos, a, b, c ) + GetPyramidBuoyancy( pos, b, a, -c );

	// ... plus faces at pos + b and pos - b
	const Vector4D sumCB = sumC + GetPyramidBuoyancy( pos, c, a, b ) + GetPyramidBuoyancy( pos, a, c, -b );

	// ... plus faces at pos + a and pos - a
	return sumCB + GetPyramidBuoyancy( pos, b, c, a ) + GetPyramidBuoyancy( pos, c, b, -a );
}


// Times the scalar six-face buoyancy evaluation over a fixed number of iterations and
// prints the tick count per box together with the accumulated w divided by the
// iteration count. `a` is taken by value because it is nudged by a tiny amount on
// every iteration.
void BenchmarkBoxBuoyancy( Vector a, const Vector& b, const Vector& c, const Vector& pos )
{
	const int nIterations = 100000;
	Vector4D result;

	const int start = GetHardwareClockFast();
	result.Init(0,0,0,0);
	for ( int nPass = 0; nPass < nIterations; ++nPass )
	{
		// merge the six faces with operator %, left to right, then fold into the running result
		const Vector4D boxSample = GetPyramidBuoyancy( pos, a, b, c ) % GetPyramidBuoyancy( pos, b, a, -c ) % GetPyramidBuoyancy( pos, c, a, b ) % GetPyramidBuoyancy( pos, a, c, -b ) % GetPyramidBuoyancy( pos, b, c, a ) % GetPyramidBuoyancy( pos, c, b, -a );
		result = result % boxSample;

		// NOTE(review): presumably this perturbation exists so iterations are not treated as loop-invariant -- confirm
		a += Vector(1e-24f, 1e-25f, 1e-26f);
	}
	const int end = GetHardwareClockFast();

	const int32 nTicksPerBox = int32( ( end - start ) ) / nIterations;
	const float flMeanVolume = result.w / nIterations;
	Msg( "Box Buoyancy Scalar Benchmark: %d ticks/box, volume %g \n", nTicksPerBox, flMeanVolume );
}


// Returns `in` rotated by a QAngle whose second component is flDegrees and whose
// other two components are zero.
const Vector RotateZ( const Vector & in, float flDegrees )
{
	const QAngle angSecondOnly( 0, flDegrees, 0 );
	Vector vecRotated;
	VectorRotate( in, angSecondOnly, vecRotated );
	return vecRotated;
}

// Returns `in` rotated by a QAngle whose first component is flDegrees and whose
// other two components are zero.
const Vector RotateY( const Vector & in, float flDegrees )
{
	const QAngle angFirstOnly( flDegrees, 0, 0 );
	Vector vecRotated;
	VectorRotate( in, angFirstOnly, vecRotated );
	return vecRotated;
}

// Value-returning convenience wrapper around VectorRotate( in, a, out ).
const Vector Rotate( const Vector & in, const QAngle &a )
{
	Vector vecRotated;
	VectorRotate( in, a, vecRotated );
	return vecRotated;
}


struct Test_t
{
	void Test()
	{
		PermTest();
#ifdef _DEBUG
		BuoyancyTest();
#else
		Benchmark();
#endif
	}

	bool TestAllEqual( const fltx4 & a, const fltx4 & b )
	{
		return IsAllEqual( a, b );
	}

	void PermTest()
	{
#ifdef _DEBUG
		fltx4 f4Canonical = {0.125f, 1.125f, 2.125f, 3.125f};
		float flCanonical[4] = {0.125f, 1.125f, 2.125f, 3.125f};
		fltx4 f4CanonicalYXZW = {1.125f, 0.125f, 2.125f, 3.125f};
		fltx4 f4CanonicalXZYW = {0.125f, 2.125f, 1.125f, 3.125f};
		fltx4 f4CanonicalZYXW = {2.125f, 1.125f, 0.125f, 3.125f};
		fltx4 f4CanonicalXXYW = {0.125f, 0.125f, 1.125f, 3.125f};
		fltx4 f4CanonicalYZZW = {1.125f, 2.125f, 2.125f, 3.125f};

		Assert( TestAllEqual( f4Canonical, LoadUnalignedSIMD( flCanonical ) ) );

		for ( int i = 0; i < 4; ++i )
		{
			float flSubFloat = SubFloat( f4Canonical, i );
			Assert( fabs( flSubFloat - float( i ) - 0.125f ) < 1e-6f );
		}


		Assert( TestAllEqual( PermYXZW( f4Canonical ), ( f4CanonicalYXZW ) ) );
		Assert( TestAllEqual( PermXZYW( f4Canonical ), ( f4CanonicalXZYW ) ) );
		Assert( TestAllEqual( PermZYXW( f4Canonical ), ( f4CanonicalZYXW ) ) );
		Assert( TestAllEqual( PermXXYW( f4Canonical ), ( f4CanonicalXXYW ) ) );
		Assert( TestAllEqual( PermYZZW( f4Canonical ), f4CanonicalYZZW ) );
#endif
	}

	void BuoyancyTest()
	{
		Vector test[][3] =
		{
			{Vector( 1, 0, 0 ), Vector( 0, 0, 1 ), Vector( 0, 0, 0.0f )},
			{Vector( 1, 0, 1 ), Vector( -1, 0, 1 ), Vector( 0, 0, -0.5f )},
			{Vector( 0, 1, 1 ), Vector( 0, -1, 1 ), Vector( 0, 0, 0.0f )},
			{Vector( 0, 2, 2 ), Vector( 0, -2, 2 ), Vector( 0, 0, 0.0f )},
			{Vector( 5, 0, 5 ), Vector( -1, 0, 1 ), Vector( 0, 0, 0.0f )},
			{Vector( 2, 0, 1 ), Vector( -1, 0, 2 ), Vector( 0, 0, 0.0f )},
			{RotateZ(Vector( 1, 0, 1 ),45), RotateZ(Vector( -1, 0, 1 ),45), Vector( 0, 0, 0.0f )},
			{RotateZ(Vector( 1, 0, 1 ),30), RotateZ(Vector( -1, 0, 1 ),30), Vector( 0, 0, 0.5f )},
			{RotateZ(Vector( sqrtf(0.5f), 0, sqrtf(0.5f) ),45), RotateZ(Vector( 0, 1, 0 ),45), Vector( 0, 0, 0.5f )},
			{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0)}, // unit cube with tips extended high/low
			{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0.01f)}, // unit cube with tips extended high/low
			{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0.25f)}, // unit cube with tips extended high/low
			{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0.5f)}, // unit cube with tips extended high/low
			{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,-0.25f)}, // unit cube with tips extended high/low
			{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,-0.5f)}, // unit cube with tips extended high/low
			{Vector( 2, 1, 1 ), Vector( -1, 1, 1 ), Vector( 0, 0, 0.0f )},
			{Vector( 2, 1, 1 ), Vector( -1, 1, 1 ), Vector( 0, 0, 0.5f )},
			{Vector( 0, 2, 1 ).Normalized(), Vector( 1, -1, 2 ).Normalized(), Vector( 0, 0, 0 )},
			{Vector( -0.804987f, 0.250343f, -0.811212f ), Vector( 0.474009f, -0.625978f,-0.663551f ).Normalized(), Vector( 1, 0, 0 )}
		};

		float flMaxError = 0;
		for ( int nAttempt = 0, numAttempts = 1000000; nAttempt < numAttempts; ++nAttempt )
		{
			Vector a = RandomVector( -1, 1 ), c = RandomVector( -1, 1 ), b = CrossProduct( a, c ), pos = RandomVector( -2, 2 );
			c = CrossProduct( a, b ).Normalized() * RandomVector( 0, 1.75f ).x;
			if ( nAttempt < sizeof( test ) / sizeof( test[0] ) )
			{
				a = test[nAttempt][0];
				b = test[nAttempt][1];
				c = CrossProduct( a, b ).Normalized() /* a.Length()*/;
				pos = test[nAttempt][2];
			}
			//pos.x = 0;
			//pos.y = 0;
			//pos.z = 0;
			matrix3x4_t tm;
			tm.Init( a, b, c, pos );
			FourVectors box;
			box.LoadAndSwizzle( LoadUnalignedSIMD( &a ), LoadUnalignedSIMD( &b ), LoadUnalignedSIMD( &c ), LoadUnalignedSIMD( &pos ) );

			//fltx4 f4Result0 = GetBoxBuoyancy3x4( box );
			Vector4D result1 = GetBoxBuoyancy(a,b,c,pos);
			fltx4 f4ResultV2 = GetBoxBuoyancy3x4( box );
			fltx4 f4Residual = f4ResultV2 - LoadUnalignedSIMD( &result1 );
			float flError = sqrtf( SubFloat( Dot4SIMD( f4Residual, f4Residual ), 0 ) );
			if( flError > flMaxError )
			{
				flMaxError = flError;
				Msg( "%d. Error %g\n", nAttempt, flError);
			}

			Assert( IsAllGreaterThan( ReplicateX4( 1e-4f ), fabs( f4Residual ) ) );

			float flBoxVolume = a.Length() * b.Length() * c.Length() * 8; (void)(flBoxVolume); // debug only

			if ( ( nAttempt % ( numAttempts / 10 ) ) == 0 )
			{
				DevMsg( "." );
			}
		}
		DevMsg( "Buoyancy test completed, benchmarking\n" );
	}

	void Benchmark()
	{
		for ( int i = 0; i < 100; ++i )
		{
			Vector a = RandomVector( -1, 1 ), c = RandomVector( -1, 1 ), b = CrossProduct( a, c ), pos = RandomVector( -2, 2 );
			c = CrossProduct( a, b ).Normalized() * RandomVector( 0, 1.75f ).x;
			BenchmarkBoxBuoyancy4x3( LoadUnalignedSIMD( &a ), LoadUnalignedSIMD( &b ), LoadUnalignedSIMD( &c ), LoadUnalignedSIMD( &pos ) );
			BenchmarkBoxBuoyancy( a,b,c,pos );
		}
	}
};
static Test_t s_test;

void TestBuoyancy()
{
	s_test.Test();
}