#include "platform.h"
#include "box_buoyancy.h"
#include "mathlib/vector4d.h"
#include "hardware_clock_fast.h"
// Builds a Vector from lanes 0, 1 and 2 of a SIMD register; lane 3 is not read.
inline const Vector ToVector( const fltx4 & f4 )
{
	const float x = SubFloat( f4, 0 );
	const float y = SubFloat( f4, 1 );
	const float z = SubFloat( f4, 2 );
	return Vector( x, y, z );
}
#ifdef _X360
FORCEINLINE fltx4 PermYXZW( const fltx4 & a ) { return __vpermwi( a, 0x4B ); // 01001011b
} FORCEINLINE fltx4 PermXZYW( const fltx4 & a ) { return __vpermwi( a, 0x27 ); // 00100111b
} FORCEINLINE fltx4 PermZYXW( const fltx4 & a ) { return __vpermwi( a, 0x93 ); // 10010011b
} FORCEINLINE fltx4 PermXXYW( const fltx4 & a ) { return __vpermwi( a, 0x07 ); // 00000111b
} FORCEINLINE fltx4 PermYZZW( const fltx4 & a ) { return __vpermwi( a, 0x6B ); // 01101011b
} FORCEINLINE fltx4 Sum3SIMD( const fltx4 &a ) { return __vmsum3fp( a, Four_Ones ); } FORCEINLINE fltx4 CombineSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z, const fltx4 & w ) { fltx4 r0 = __vmrghw(x, z); fltx4 r1 = __vmrghw(y, w);
return __vmrghw(r0, r1); }
// Assumes Y(xbox),Z(PC) are splatted
FORCEINLINE fltx4 CombineXYZ_Special( const fltx4 & x, const fltx4 & y, const fltx4 & z ) { fltx4 r0 = __vmrghw(x, z); return __vmrghw(r0, y); }
#elif defined( _PS3 )
const int32 ALIGN16 g_SIMD_YXZW[4] ALIGN16_POST = { 0x04050607, 0x00010203, 0x08090A0B, 0x0C0D0E0F }; const int32 ALIGN16 g_SIMD_XZYW[4] ALIGN16_POST = { 0x00010203, 0x08090A0B, 0x04050607, 0x0C0D0E0F }; const int32 ALIGN16 g_SIMD_ZYXW[4] ALIGN16_POST = { 0x08090A0B, 0x04050607, 0x00010203, 0x0C0D0E0F }; const int32 ALIGN16 g_SIMD_XXYW[4] ALIGN16_POST = { 0x00010203, 0x00010203, 0x04050607, 0x0C0D0E0F }; const int32 ALIGN16 g_SIMD_YZZW[4] ALIGN16_POST = { 0x04050607, 0x08090A0B, 0x08090A0B, 0x0C0D0E0F };
FORCEINLINE fltx4 PermYXZW( const fltx4 & a ) { return vec_perm( a, a, (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_YXZW ) ); } FORCEINLINE fltx4 PermXZYW( const fltx4 & a ) { return vec_perm( a, a, (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_XZYW ) ); } FORCEINLINE fltx4 PermZYXW( const fltx4 & a ) { return vec_perm( a, a, (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_ZYXW ) ); } FORCEINLINE fltx4 PermXXYW( const fltx4 & a ) { return vec_perm( a, a, (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_XXYW ) ); } FORCEINLINE fltx4 PermYZZW( const fltx4 & a ) { return vec_perm( a, a, (vec_uchar16)LoadAlignedIntSIMD( g_SIMD_YZZW ) ); } FORCEINLINE fltx4 Sum3SIMD( const fltx4 &a ) { return SplatXSIMD( a ) + SplatYSIMD( a ) + SplatZSIMD( a ); }
const int32 ALIGN16 g_SIMD_XAXA[4] ALIGN16_POST = { 0x00010203, 0x10111213, 0x00010203, 0x10111213 }; const int32 ALIGN16 g_SIMD_XYAB[4] ALIGN16_POST = { 0x00010203, 0x10111213, 0x00010203, 0x10111213 }; FORCEINLINE fltx4 CombineSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z, const fltx4 & w ) { //fltx4 xy = vec_perm(x, y, LoadAlignedIntSIMD( g_SIMD_XAXA ) );
//fltx4 zw = vec_perm(z, w, LoadAlignedIntSIMD( g_SIMD_XAXA ) );
fltx4 xzxz = vec_mergeh(x, z); fltx4 ywyw = vec_mergeh(y, w);
return vec_mergeh(xzxz, ywyw); }
// Assumes Y(xbox),Z(PC) are splatted
FORCEINLINE fltx4 CombineXYZ_Special( const fltx4 & x, const fltx4 & y, const fltx4 & z ) { fltx4 r0 = vec_mergeh(x, z); return vec_mergeh(r0, y); } #else
FORCEINLINE fltx4 PermYXZW( const fltx4 & a ) { return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 2, 0, 1 ) ); } FORCEINLINE fltx4 PermXZYW( const fltx4 & a ) { return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 1, 2, 0 ) ); } FORCEINLINE fltx4 PermZYXW( const fltx4 & a ) { return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 0, 1, 2 ) ); } FORCEINLINE fltx4 PermXXYW( const fltx4 & a ) { return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 1, 0, 0 ) ); }
FORCEINLINE fltx4 PermYZZW( const fltx4 & a ) { return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 2, 2, 1 ) ); } FORCEINLINE fltx4 Sum3SIMD( const fltx4 &a ) { return SplatXSIMD( a ) + SplatYSIMD( a ) + SplatZSIMD( a ); } FORCEINLINE fltx4 CombineSIMD( const fltx4 & row0, const fltx4 & row1, const fltx4 & row2, const fltx4 & row3 ) { fltx4 tmp0 = _mm_shuffle_ps( row0, row1, 0x44); fltx4 tmp1 = _mm_shuffle_ps( row2, row3, 0x44); return _mm_shuffle_ps(tmp0, tmp1, 0x88); }
// Assumes Y(xbox),Z(PC) are splatted
FORCEINLINE fltx4 CombineXYZ_Special( const fltx4 & x, const fltx4 & y, const fltx4 & z ) { fltx4 tmp0 = _mm_shuffle_ps( x, y, 0x44); return _mm_shuffle_ps(tmp0, z, 0x88); }
// Convenience entry point: packs the four input registers into a FourVectors with
// FourVectors::LoadAndSwizzle and forwards the result to GetBoxBuoyancy3x4.
fltx4 GetBoxBuoyancy4x3( const fltx4& f4a, const fltx4& f4b, const fltx4&f4c, const fltx4&f4Origin )
{
	FourVectors swizzledBox;
	swizzledBox.LoadAndSwizzle( f4a, f4b, f4c, f4Origin );

	const fltx4 f4Buoyancy = GetBoxBuoyancy3x4( swizzledBox );
	return f4Buoyancy;
}
void BenchmarkBoxBuoyancy4x3( const fltx4& f4a, const fltx4& f4b, const fltx4&f4c, const fltx4&f4Origin ) { FourVectors box; box.LoadAndSwizzle( f4a, f4b, f4c, f4Origin ); fltx4 result = {0, 0, 0, 0};
int start, end; const int nIterations = 1000000;
start = GetHardwareClockFast(); for ( int i = 0; i < nIterations; ++i ) { result = result + GetBoxBuoyancy3x4( box ); box.x = AndSIMD( box.x, box.x ); } end = GetHardwareClockFast(); Msg( "Box Buoyancy 4x3 Benchmark: %d ticks/box, volume %g \n", int32( ( end - start ) ) / nIterations, SubFloat( result, 3 ) / nIterations ); }
// Disabled reference implementations of the element-wise fltx4 arithmetic operators.
// They are kept commented out: BenchmarkBoxBuoyancy4x3 above already applies operator+ to fltx4 values
// before this point, so the operators in use come from elsewhere and these would only duplicate them.
/*
inline fltx4 operator - ( const fltx4 & a, const fltx4 & b ) { return SubSIMD( a, b ); }
inline fltx4 operator + ( const fltx4 & a, const fltx4 & b ) { return AddSIMD( a, b ); }
inline fltx4 operator * ( const fltx4 & a, const fltx4 & b ) { return MulSIMD( a, b ); }
*/
// Per-lane clamp: raises each lane of a to at least low, then caps it at high.
inline fltx4 Bound( const fltx4 & a, const fltx4 &low, const fltx4 &high )
{
	const fltx4 raisedToLow = MaxSIMD( a, low );
	return MinSIMD( raisedToLow, high );
}
// Per-lane clamp of a to the range [0, 1].
inline fltx4 Limit01( const fltx4 & a )
{
	const fltx4 nonNegative = MaxSIMD( Four_Zeros, a );
	return MinSIMD( nonNegative, Four_Ones );
}
// Splatted fractional constants used by the integral helpers below.
const fltx4 Four_One6th =
{
	1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f
};
const fltx4 Four_One4th =
{
	0.25f, 0.25f, 0.25f, 0.25f
};
const fltx4 Four_One12th =
{
	1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f
};
// Closed form of the integral over y in [alpha, 1] of:  y * (tipZ + (baseZ - tipZ) * y)
//   = baseZ/3 + tipZ/6 - alpha^2 * ( tipZ/2 - alpha * (tipZ - baseZ) / 3 )
// (MsubSIMD( a, b, c ) evaluates c - a*b.)
inline fltx4 TriHelperIntegralAto1( const fltx4 &alpha, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 alphaSqr = alpha * alpha;
	const fltx4 alphaTimesDrop = alpha * ( tipZ - baseZ );

	// tipZ/2 - alpha*(tipZ - baseZ)/3
	const fltx4 lowerBracket = MsubSIMD( Four_Thirds, alphaTimesDrop, Four_PointFives * tipZ );

	// tipZ/6 - alpha^2 * lowerBracket
	const fltx4 tipPart = MsubSIMD( alphaSqr, lowerBracket, Four_One6th * tipZ );

	return MaddSIMD( Four_Thirds, baseZ, tipPart );
}
// Closed form of the integral over y in [alpha, 1] of:  y * (tipZ + (baseZ - tipZ) * y)^2
// The result is grouped by powers of baseZ and tipZ:
//   baseZ^2     * (1 - alpha^4) / 4
// + baseZ*tipZ  * (1/6 + alpha^3 * (alpha/2 - 2/3))
// - tipZ^2      * (alpha - 1)^3 * (1/12 + alpha/4)
inline fltx4 TriHelperZ2IntegralAto1( const fltx4 &alpha, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 alphaSqr = alpha * alpha;
	const fltx4 alphaMinus1 = alpha - Four_Ones;
	const fltx4 alphaMinus1Sqr = alphaMinus1 * alphaMinus1;

	const fltx4 baseSqrTerm = Four_One4th * ( Four_Ones - alphaSqr ) * ( alphaSqr + Four_Ones ) * baseZ * baseZ;
	const fltx4 crossTerm = ( Four_One6th + alphaSqr * alpha * ( Four_PointFives * alpha - Four_TwoThirds ) ) * baseZ * tipZ;
	const fltx4 tipSqrTerm = alphaMinus1Sqr * alphaMinus1 * ( Four_One12th + Four_One4th * alpha ) * tipZ * tipZ;

	return baseSqrTerm + crossTerm - tipSqrTerm;
}
// Closed form of the integral over y in [alpha, 1] of:
//   y * (tipZ + (baseZ - tipZ) * y) * (tipX + (baseX - tipX) * y)
// The bracketed sum below is 12x the integral, split into the part weighted by tipX and the part
// weighted by baseX; the final multiply by 1/12 normalizes it.
inline fltx4 TriHelperZ2IntegralAto1( const fltx4 &alpha, const fltx4 &tipZ, const fltx4 &baseZ, const fltx4 &tipX, const fltx4 &baseX )
{
	const fltx4 alphaSqr = alpha * alpha;
	const fltx4 alphaMinus1 = alpha - Four_Ones;
	const fltx4 alphaMinus1Sqr = alphaMinus1 * alphaMinus1;

	const fltx4 tipXWeight = ( Four_Ones + alpha * ( Four_Twos + Four_Threes * alpha ) ) * baseZ + tipZ + ( Four_Twos - Four_Threes * alpha ) * alpha * tipZ;
	const fltx4 baseXWeight = -Four_Threes * ( alphaSqr * alphaSqr - Four_Ones ) * baseZ + tipZ + alphaSqr * alpha * ( Four_Threes * alpha - Four_Fours ) * tipZ;

	const fltx4 twelveTimesIntegral = alphaMinus1Sqr * tipX * tipXWeight + baseX * baseXWeight;
	return twelveTimesIntegral * Four_One12th;
}
// Closed form of the integral over y in [0, beta] of:  y * (tipZ + (baseZ - tipZ) * y)
//   = beta^2 * ( tipZ/2 + (baseZ - tipZ) * beta / 3 )
inline fltx4 TriHelperIntegral0toB( const fltx4 &beta, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 betaSqr = beta * beta;
	const fltx4 riseAtBeta = ( baseZ - tipZ ) * beta;
	const fltx4 bracket = MaddSIMD( Four_Thirds, riseAtBeta, Four_PointFives * tipZ );
	return betaSqr * bracket;
}
// Disabled debugging helper: returns lane i of a SIMD register widened to double.
// Kept commented out; its only visible use is in commented-out validation code.
/*
double SubDbl( const fltx4& a, int i ) { return SubFloat( a, i ); }
*/
// Closed form of the integral over y in [0, beta] of:  y * (tipZ + (baseZ - tipZ) * y)^2
//   = beta^2 * ( (beta*dz)^2 / 4 + 2/3 * (beta*dz) * tipZ + tipZ^2 / 2 ),  dz = baseZ - tipZ
inline fltx4 TriHelperZ2Integral0toB( const fltx4 &beta, const fltx4 &tipZ, const fltx4 &baseZ )
{
	const fltx4 heightDelta = baseZ - tipZ;
	const fltx4 scaledDelta = beta * heightDelta;

	const fltx4 bracket = Four_One4th * scaledDelta * scaledDelta + Four_TwoThirds * scaledDelta * tipZ + Four_PointFives * tipZ * tipZ;
	const fltx4 integral = beta * beta * bracket;

	// Disabled double-precision cross-check of the expression above (bdz == scaledDelta):
	//	double testIntegral[4];
	//	for ( int i = 0; i < 4; ++i )
	//	{
	//		testIntegral[i] = SubDbl( beta, i ) * SubDbl( beta, i ) * ( SubDbl( Four_One4th, i ) * SubDbl( bdz, i ) * SubDbl( bdz, i ) + SubDbl( Four_TwoThirds, i ) * SubDbl( bdz, i ) * SubDbl( tipZ, i ) + SubDbl( Four_PointFives, i ) * SubDbl( tipZ, i ) * SubDbl( tipZ, i ) );
	//	}

	return integral;
}
// Closed form of the integral over y in [0, beta] of:
//   y * (tipZ + (baseZ - tipZ) * y) * (tipX + (baseX - tipX) * y)
//   = beta^2 * ( beta^2 * dx * dz / 4 + tipX * tipZ / 2 + beta/3 * (baseZ*tipX + (baseX - 2 tipX) * tipZ) )
// note: baseX should be the center of the base coordinate
inline fltx4 TriHelperZ2Integral0toB( const fltx4 &beta, const fltx4 &tipZ, const fltx4 &baseZ, const fltx4 &tipX, const fltx4 &baseX )
{
	const fltx4 deltaZ = baseZ - tipZ;
	const fltx4 deltaX = baseX - tipX;
	const fltx4 betaSqr = beta * beta;

	const fltx4 quarticPart = betaSqr * Four_One4th * deltaX * deltaZ;
	const fltx4 constantPart = Four_PointFives * tipX * tipZ;
	const fltx4 cubicPart = Four_Thirds * beta * ( baseZ * tipX + ( baseX - Four_Twos * tipX ) * tipZ );

	return betaSqr * ( quarticPart + constantPart + cubicPart );
}
// Three times the integral over y in [0, 1] of (z0 + (z1 - z0) * y)^2,
// i.e. z0^2 + z0*z1 + z1^2, evaluated as z0 * (z0 + z1) + z1^2.
inline fltx4 TrplAvgSqrZ( const fltx4& z0, const fltx4 &z1 )
{
	const fltx4 zSum = z0 + z1;
	const fltx4 z1Sqr = z1 * z1;
	return MaddSIMD( z0, zSum, z1Sqr );
}
// Six times the integral over y in [0, 1] of (z0 + (z1 - z0) * y) * (x0 + (x1 - x0) * y),
// i.e. x0 * (2 z0 + z1) + x1 * (2 z1 + z0).
inline fltx4 SixAvgSqrZX( const fltx4& z0, const fltx4 &z1, const fltx4& x0, const fltx4 &x1 )
{
	const fltx4 weightForX0 = MaddSIMD( Four_Twos, z0, z1 );
	const fltx4 weightForX1 = MaddSIMD( Four_Twos, z1, z0 );
	return x0 * weightForX0 + x1 * weightForX1;
}
// Splatted tolerance (1e-6) used for the "is this effectively zero" comparisons in this file.
const fltx4 f4Epsilon =
{
	1e-6f, 1e-6f, 1e-6f, 1e-6f
};
// Lane-wise 3D cross product a x b of two FourVectors (MsubSIMD( p, q, r ) evaluates r - p*q).
inline FourVectors Cross( const FourVectors &a, const FourVectors &b )
{
	const fltx4 ayTimesBz = MulSIMD( a.y, b.z );
	const fltx4 azTimesBx = MulSIMD( a.z, b.x );
	const fltx4 axTimesBy = MulSIMD( a.x, b.y );

	FourVectors product;
	product.x = MsubSIMD( a.z, b.y, ayTimesBz );
	product.y = MsubSIMD( a.x, b.z, azTimesBx );
	product.z = MsubSIMD( a.y, b.x, axTimesBy );
	return product;
}
// Z component only of the lane-wise cross product a x b:  a.x * b.y - a.y * b.x
inline fltx4 CrossZ( const FourVectors &a, const FourVectors &b )
{
	const fltx4 axTimesBy = MulSIMD( a.x, b.y );
	return MsubSIMD( a.y, b.x, axTimesBy );
}
// Per-lane square of a.
inline fltx4 Sqr( const fltx4 &a )
{
	const fltx4 squared = a * a;
	return squared;
}
// FourVectors overload of MsubSIMD: computes c - a*b component by component,
// with the same scalar register b applied to x, y and z.
inline FourVectors MsubSIMD( const FourVectors &a, const fltx4 &b, const FourVectors &c)
{
	FourVectors difference;
	difference.x = MsubSIMD( a.x, b, c.x );
	difference.y = MsubSIMD( a.y, b, c.y );
	difference.z = MsubSIMD( a.z, b, c.z );
	return difference;
}
// Per-lane scale constants: ( 1, 1, 1/2, 4 ) and ( 4, 4, 2, 4 ).
const fltx4 g_f4_11h4 = {1,1,0.5f,4.0f};
const fltx4 g_f4_4424 = {4,4,2,4};

// Very large stand-in reciprocal; GetBoxBuoyancy3x4 substitutes it where a face is flat
// and the true reciprocal of its z extent is undefined.
const fltx4 g_f4AlmostInifiniteSlope = {1e+24,1e+24,1e+24,1e+24};

// Bit masks: sign bit in lanes 0..2, with lane 3 either fully set (_W) or fully clear (_NoW).
const int32 ALIGN16 g_SIMD_signmask_W[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0xFFFFFFFF };
const int32 ALIGN16 g_SIMD_signmask_NoW[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0 };
// physical interpretation: we're integrating the pressure force (pascals) along the submerged surface.
// in other words, we substitute the usual volume integral for surface integral
// Xbox360: 1250 cycles; Core2 Quad: 500 cycles; Core i7: ? cycles ; error: 2e-5
//
// Input layout, as read by the code below:
//   lanes 0..2 of box_in.x / box_in.y / box_in.z hold the x / y / z components of the three box axis
//   vectors a, b, c (the box corners are center +/- a +/- b +/- c);
//   lane 3 (W) of box_in.x / box_in.y / box_in.z holds the box center; the center's z is its height above
//   the water level, i.e. the water surface is the plane z = 0.
// Result (the "#if 1" variant at the bottom):
//   lanes 0,1 = x,y of the center of mass of the displaced fluid multiplied by the displaced volume,
//   lane 2    = 0 (the Z lever is intentionally not computed),
//   lane 3    = volume of the displaced fluid.
// Naming: *_Cpos / *_Cneg are the quantities for the face offset by +c / -c from the box center.
fltx4 GetBoxBuoyancy3x4( const FourVectors &box_in )
{
	FourVectors box; // sorted box
	// make (a,b,c).z > 0
	fltx4 f4SignMask = LoadAlignedSIMD( g_SIMD_signmask );
	fltx4 signZ = AndSIMD( box_in.z, f4SignMask );
	box.x = XorSIMD( box_in.x, signZ );
	box.y = XorSIMD( box_in.y, signZ );
	box.z = AndNotSIMD( f4SignMask, box_in.z );
	fltx4 boxCenterZ = SplatWSIMD( box_in.z ); // the height of the center of the box above the water level
	fltx4 boxCenterXY = AndSIMD( SetYSIMD( SplatWSIMD( box_in.x ), SplatWSIMD( box_in.y ) ), LoadAlignedSIMD( g_SIMD_SkipTailMask[2] ) ); // there are a lot of scheduling holes on this stage, so we might as well precompute something
	// high point of the box, a+b+c
	fltx4 boxTopX = Sum3SIMD( box.x );
	fltx4 boxTopY = Sum3SIMD( box.y );
	fltx4 boxTopZrel = Sum3SIMD( box.z );
	fltx4 boxTopZabs = boxCenterZ + boxTopZrel, boxBotZ = boxCenterZ - boxTopZrel;
	// sort a.z > b.z > c.z > 0; sorting takes 43 cycles on xbox360
	bi32x4 swap_a_c = CmpLtSIMD( SplatXSIMD( box.z ), SplatZSIMD( box.z ) );
	box.x = MaskedAssign( swap_a_c, PermZYXW( box.x ), box.x );
	box.y = MaskedAssign( swap_a_c, PermZYXW( box.y ), box.y );
	box.z = MaskedAssign( swap_a_c, PermZYXW( box.z ), box.z );
	bi32x4 isBsmaller = CmpLtSIMD( SplatYSIMD( box.z ), box.z );
	bi32x4 ordered_a_b = SplatXSIMD( isBsmaller ); // if a > b, they're ordered correctly
	box.x = MaskedAssign( ordered_a_b, box.x, PermYXZW( box.x ) );
	box.y = MaskedAssign( ordered_a_b, box.y, PermYXZW( box.y ) );
	box.z = MaskedAssign( ordered_a_b, box.z, PermYXZW( box.z ) );
	bi32x4 swap_b_c = SplatZSIMD( isBsmaller ); // if b < c, we need to swap them
	box.x = MaskedAssign( swap_b_c, PermXZYW( box.x ), box.x );
	box.y = MaskedAssign( swap_b_c, PermXZYW( box.y ), box.y );
	box.z = MaskedAssign( swap_b_c, PermXZYW( box.z ), box.z );
	Assert( SubFloat( box.z, 0 ) >= SubFloat( box.z, 1 ) && SubFloat( box.z, 1 ) >= SubFloat( box.z, 2 ) && SubFloat( box.z, 2 ) >= 0 );
	// sorted and positive, time to integrate sides: (a,b) (a,c) (b,c)
	// (a-b).z > (b-a).z, so the a+b, a-b, b-a, -a-b is the order of corners, top-to-bottom
	FourVectors boxA, boxB; // these two represent a and b of each pair of edges defining a side: lanes = (a,b) (a,c) (b,c)
	boxA.x = PermXXYW( box.x );
	boxA.y = PermXXYW( box.y );
	boxA.z = PermXXYW( box.z );
	boxB.x = PermYZZW( box.x );
	boxB.y = PermYZZW( box.y );
	boxB.z = PermYZZW( box.z );
	FourVectors boxC; // "c" maps to +/-c,b,a
	boxC.x = PermZYXW( box.x );
	boxC.y = PermZYXW( box.y );
	boxC.z = PermZYXW( box.z );
	// if a.z == 0 , b.z is also 0, so the whole rectangle is parallel to z=const
	bi32x4 isSideFlat = CmpLtSIMD( boxA.z, f4Epsilon );
	fltx4 rcpAz = AndNotSIMD( isSideFlat, ReciprocalSIMD( boxA.z ) );
	fltx4 rcp2Az = Four_PointFives * rcpAz;
	// the part of quad along a that's in the triangles cut by z=const surfaces
	// this is the same regardless of C
	//
	//         * (a+b)       cut = 0
	//        /|
	//       / |
	// (a-b) *--+            cut = f4CutPart
	//       |  |
	//       |  |
	//       +--* (b-a)      cut = 1
	//       | /
	//       |/
	//       * (-a-b)        cut = 1 + f4CutPart
	//
	// ("cut" is the cut level / fraction measured top-to-bottom along a)
	//                (a+b)-(a-b)      2b      b
	// computed as    -----------  ==  --  ==  -     (z components)
	//                (a+b)-(b-a)      2a      a
	fltx4 f4CutPart = MulSIMD( boxB.z, rcpAz ); // this must be between 0 (b is parallel to z=const) and 1 (a and b both have 45' slope)
	Assert( IsAllGreaterThanOrEq( Four_Ones + f4Epsilon, SetWToZeroSIMD( f4CutPart ) ) && IsAllGreaterThanOrEq( f4CutPart + f4Epsilon, SetWToZeroSIMD( Four_Zeros ) ) );
	//fltx4 rcpCutPart = AndSIMD( ReciprocalSIMD( f4CutPart ), CmpGtSIMD( f4CutPart, f4Epsilon ) );
	// integrate the full sides of the box, multiplied by the XY projection areas
	fltx4 f4SideProj = fabs( CrossZ( boxA, boxB ) );
	// here's the center-of-mass and total volume integral solution:
	// {{4/3 (3 x0 z0 + xA zA + xB zB), 4/3 (3 y0 z0 + yA zA + yB zB), 2/3 (3 z0^2 + zA^2 + zB^2), 4 z0},
	// {1/24 (4 x0 (3 z0 + zA + zB) + xA (4 z0 + 2 zA + zB) + xB (4 z0 + zA + 2 zB)),
	// 1/24 (4 y0 (3 z0 + zA + zB) + yA (4 z0 + 2 zA + zB) + yB (4 z0 + zA + 2 zB)),
	// 1/24 (6 z0^2 + zA^2 + zA zB + zB^2 + 4 z0 (zA + zB)),
	// 1/6 (3 z0 + zA + zB)}}
	//fltx4 f4FullZ0_Cpos = boxCenterZ + boxC.z, f4FullZ0_Cneg = boxCenterZ - boxC.z;
	// 4/3 (3 x0 z0 + xA zA + xB zB) type of integral : take x0 z0 + (xA zA + xB zB) / 3
	// consider that x0 = +/- boxC.x and z0 = boxCenterZ +/- boxC.z, we're left with
	// +/- boxCenter boxC.x + boxC.x boxC.z + (xA zA + xB zB) / 3
	// Again, the only part that changes is (+/- boxCenterZ boxC.x)
	fltx4 f4Full_X_common = boxC.x * boxC.z + Four_Thirds * ( boxA.x * boxA.z + boxB.x * boxB.z );
	fltx4 f4Full_X_Cpos = Four_Fours * (boxCenterZ * boxC.x + f4Full_X_common);
	fltx4 f4Full_X_Cneg = Four_Fours * (f4Full_X_common - boxCenterZ * boxC.x);
	// y is the same as x
	fltx4 f4Full_Y_common = boxC.y * boxC.z + Four_Thirds * ( boxA.y * boxA.z + boxB.y * boxB.z );
	fltx4 f4Full_Y_Cpos = Four_Fours * ( boxCenterZ * boxC.y + f4Full_Y_common ) ;
	fltx4 f4Full_Y_Cneg = Four_Fours * ( f4Full_Y_common - boxCenterZ * boxC.y ) ;
	// z is different: 2/3 (3 z0^2 + zA^2 + zB^2) ; z0 = boxCenterZ +/- boxC.z,
	// so we can just add the difference of 4 * boxCenterZ * boxC.z to get from Cneg to Cpos
	fltx4 f4Full_Z_common = Four_TwoThirds * ( Sqr( boxA.z ) + Sqr( boxB.z ) );
	fltx4 f4Full_Z_Cpos = MaddSIMD( Four_Twos, Sqr( boxCenterZ + boxC.z ), f4Full_Z_common );
	fltx4 f4Full_Z_Cneg = MaddSIMD( Four_Twos, Sqr( boxCenterZ - boxC.z ), f4Full_Z_common );
	fltx4 f4Full_W_Cpos = Four_Fours * ( boxCenterZ + boxC.z ), f4Full_W_Cneg = Four_Fours * ( boxCenterZ - boxC.z );
	// this is how we'd compute the center of mass for fully-submerged cube, for validation
#ifdef _DEBUG
	fltx4 f4TestVolume = Dot3SIMD( f4Full_W_Cpos - f4Full_W_Cneg, f4SideProj );
	fltx4 f4TestSideProjDivVolume = f4SideProj * ReciprocalSIMD( f4TestVolume );
	fltx4 f4TestLeverX = Dot3SIMD( f4Full_X_Cpos - f4Full_X_Cneg, f4TestSideProjDivVolume ), f4TestLeverY = Dot3SIMD( f4Full_Y_Cpos - f4Full_Y_Cneg, f4TestSideProjDivVolume );
	fltx4 f4TestLeverZ = Dot3SIMD( f4Full_Z_Cpos - f4Full_Z_Cneg, f4TestSideProjDivVolume );
	fltx4 f4TestResult = CombineSIMD( f4TestLeverX + SplatWSIMD(box_in.x), f4TestLeverY + SplatWSIMD(box_in.y), f4TestLeverZ, f4TestVolume );
	(void)f4TestResult;
#endif
	// Computing Center parallelogram component of the full surface integral
	// To compute the integral across the submerged part of each of 6 faces, we'll compute these components and then selectively sum them up
	// to form the full integral: the top and bottom triangle.
	// if the water level is intersecting top triangle ((a-b).z < 0) , we'll subtract top triangle integral from full integral
	// if the water level is intersecting bottom triangle ((b-a).z < 0) , we'll select just the bottom triangle integral
	// .. and we'll have to compute the middle part because it's not symmetrical ..
	// .. on the second thought, we compute the center (parallelogram) , upper tri and lower tri
	// for the center computation, we need the point of the middle of the center and m=b-ra parallel to the water
	// waterTop is{ 0 = at V0 top; cut = at V1; 1 = at V2; 1+cut = at V3 bottom of the quad }
	// waterBot is central-symmetrical, negative
	// to find the fraction of right side of rectangle (the +b side) that has z=0
	// this is different for +C and -C sides
	//                (a+b) +/- c + p       a+b +/- c + p
	// computed as    ---------------  ==  ---------------   // note: "+/-" was originally the plus-minus sign, typed by Alt + 0177
	//                  (a+b)-(b-a)              2 a
	// Warning: I take special care in cases of flat faces (z=const, when rcpAz is undefined)
	// in these cases, submerged faces must have water<=0 and faces above water (z>0) must have water >= 1 + cut
	// Note: If I take care not to compute fully-submerged or fully-above-water polytopes, I only need to check
	// below-water case for Cneg faces and above-water case for Cpos faces
	// The trick I'm using here to account for everything is perturb the face's slope slightly to effectively divide by epsilon
	fltx4 rcp2AzSpecial = MaskedAssign( isSideFlat, g_f4AlmostInifiniteSlope, rcp2Az );
	fltx4 f4WaterPart_Cpos = boxTopZabs * rcp2AzSpecial, f4WaterPart_Cneg = MaddSIMD( boxBotZ, rcp2AzSpecial, f4CutPart ) + Four_Ones;
	// on the central piece, we need to integrate along axes (a,m = b - cut*a) and ranges {-1+cut...max(-1+cut,1-max(w,cut)) , -1...1}
	// even cut and w have the same denominator: it's cut=2b/2a and water=topZ/2a
	//fltx4 f4HighLimit_Cpos = MaxSIMD( f4LowLimit, Four_Ones - MaxSIMD( f4WaterPart_Cpos, f4CutPart ) );
	//fltx4 f4HighLimit_Cneg = MaxSIMD( f4LowLimit, Four_Ones - MaxSIMD( f4WaterPart_Cneg, f4CutPart ) );
	fltx4 f4TopWaterInCenter_Cpos = MinSIMD( Four_Ones, MaxSIMD( f4CutPart, f4WaterPart_Cpos ) );
	fltx4 f4TopWaterInCenter_Cneg = MinSIMD( Four_Ones, MaxSIMD( f4CutPart, f4WaterPart_Cneg ) );
	// the range is full (1 means full span of the whole center parallelogram)
	// but the origin is to be multiplied by A, so 1 means half of the length (-1 means 0 area)
	fltx4 f4CenterRange_Cpos = Four_Ones - f4TopWaterInCenter_Cpos, f4CenterOriginA_Cpos = f4CutPart - f4TopWaterInCenter_Cpos;
	fltx4 f4CenterRange_Cneg = Four_Ones - f4TopWaterInCenter_Cneg, f4CenterOriginA_Cneg = f4CutPart - f4TopWaterInCenter_Cneg;
	// given the span (we're integrating from -span to +span), we can compute the center point for integration: ((r-1) + (1-max(w,r)))/2
	// we can also compute the area of projection, because we reduce the area of the face by 1-max(r,w), i.e. by the span
	fltx4 f4CenterProj_Cpos = f4SideProj * f4CenterRange_Cpos, f4CenterProj_Cneg = f4SideProj * f4CenterRange_Cneg;
	fltx4 f4CenterRangeSqr_Cpos = f4CenterRange_Cpos * f4CenterRange_Cpos;
	fltx4 f4CenterRangeSqr_Cneg = f4CenterRange_Cneg * f4CenterRange_Cneg;
	// to integrate the central piece, we need the center point (pos +/- (c-a*q)), q = (left unspecified in the original note); and m=b-cut a
	// because it cancels out lots of terms in the integral
	FourVectors boxM = MsubSIMD( boxA, f4CutPart, boxB ); // m=b-ra, replacement for b in the integrals
	// here's the center-of-mass and total volume integral solution. M is our B in this case.
	// {{4/3 (3 x0 z0 + xA zA + xM zM), 4/3 (3 y0 z0 + yA zA + yM zM), 2/3 (3 z0^2 + zA^2 + zM^2), 4 z0},
	// and for triangles it would be this:
	// {1/24 (4 x0 (3 z0 + zA + zM) + xA (4 z0 + 2 zA + zM) + xM (4 z0 + zA + 2 zM)),
	// 1/24 (4 y0 (3 z0 + zA + zM) + yA (4 z0 + 2 zA + zM) + yM (4 z0 + zA + 2 zM)),
	// 1/24 (6 z0^2 + zA^2 + zA zM + zM^2 + 4 z0 (zA + zM)),
	// 1/6 (3 z0 + zA + zM)}}
	// ... but we only use the rectangular integral right now
	fltx4 f4CenterX0_Cpos = boxC.x + f4CenterOriginA_Cpos * boxA.x, f4CenterX0_Cneg = f4CenterOriginA_Cneg * boxA.x - boxC.x;
	fltx4 f4CenterY0_Cpos = boxC.y + f4CenterOriginA_Cpos * boxA.y, f4CenterY0_Cneg = f4CenterOriginA_Cneg * boxA.y - boxC.y;
	fltx4 f4CenterZ0_Cpos = boxCenterZ + boxC.z + f4CenterOriginA_Cpos * boxA.z, f4CenterZ0_Cneg = boxCenterZ + f4CenterOriginA_Cneg * boxA.z - boxC.z;
	// 4/3 (3 x0 z0 + xA zA + xB zB) type of integral : take x0 z0 + (xA zA + xB zB) / 3
	// xA zA + xB zB is the common part
	//fltx4 f4Center_X_common = Four_Thirds * (boxA.x * boxA.z + boxM.x * boxM.z );
	fltx4 boxMxz = boxM.x * boxM.z, boxAxz = boxA.x * boxA.z;
	fltx4 f4Center_X_Cpos = Four_Fours * MaddSIMD( f4CenterX0_Cpos, f4CenterZ0_Cpos, Four_Thirds * MaddSIMD( boxAxz, f4CenterRangeSqr_Cpos, boxMxz ) );
	fltx4 f4Center_X_Cneg = Four_Fours * MaddSIMD( f4CenterX0_Cneg, f4CenterZ0_Cneg, Four_Thirds * MaddSIMD( boxAxz, f4CenterRangeSqr_Cneg, boxMxz ) );
	// y is the same as x
	//fltx4 f4Center_Y_common = Four_Thirds * (boxA.y * boxA.z + boxM.y * boxM.z );
	fltx4 boxMyz = boxM.y * boxM.z, boxAyz = boxA.y * boxA.z;
	fltx4 f4Center_Y_Cpos = Four_Fours * MaddSIMD( f4CenterY0_Cpos, f4CenterZ0_Cpos, Four_Thirds * MaddSIMD(boxAyz, f4CenterRangeSqr_Cpos, boxMyz ) );
	fltx4 f4Center_Y_Cneg = Four_Fours * MaddSIMD( f4CenterY0_Cneg, f4CenterZ0_Cneg, Four_Thirds * MaddSIMD(boxAyz, f4CenterRangeSqr_Cneg, boxMyz ) );
	// z is a bit different: 2/3 (3 z0^2 + zA^2 + zB^2)
	// so we can just add the difference of 4 * boxCenterZ * boxC.z to get from Cneg to Cpos
	//fltx4 f4Center_Z_common = Four_TwoThirds * ( Sqr( boxA.z ) + Sqr( boxM.z ) );
	fltx4 boxMzz = boxM.z * boxM.z, boxAzz = boxA.z * boxA.z;
	fltx4 f4Center_Z_Cpos = Four_Twos * MaddSIMD( f4CenterZ0_Cpos, f4CenterZ0_Cpos, Four_Thirds * MaddSIMD( boxAzz, f4CenterRangeSqr_Cpos, boxMzz ) );
	fltx4 f4Center_Z_Cneg = Four_Twos * MaddSIMD( f4CenterZ0_Cneg, f4CenterZ0_Cneg, Four_Thirds * MaddSIMD( boxAzz, f4CenterRangeSqr_Cneg, boxMzz ) );
	fltx4 f4Center_W_Cpos = Four_Fours * f4CenterZ0_Cpos, f4Center_W_Cneg = Four_Fours * f4CenterZ0_Cneg;
#ifdef _DEBUG
	fltx4 f4CenterVolume = Dot3SIMD( f4Center_W_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_W_Cneg, f4CenterProj_Cneg );
	fltx4 f4CenterLeverX = Dot3SIMD( f4Center_X_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_X_Cneg, f4CenterProj_Cneg );
	fltx4 f4CenterLeverY = Dot3SIMD( f4Center_Y_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_Y_Cneg, f4CenterProj_Cneg );
	fltx4 f4CenterLeverZ = Dot3SIMD( f4Center_Z_Cpos, f4CenterProj_Cpos ) - Dot3SIMD( f4Center_Z_Cneg, f4CenterProj_Cneg );
	// this is the condensed result of previous integration
	fltx4 f4CenterComponent = CombineSIMD( f4CenterLeverX, f4CenterLeverY, f4CenterLeverZ, f4CenterVolume );
	(void)f4CenterComponent;
#endif
	// Computing triangle components
	// If top triangle is selected , Center and bottom tri are ignored and top tri is subtracted from "Full" side integrals
	// top triangle starts with the top vertex, spanning 0..-2*min(water,cut) along A and 0..-2*min(water,cut)/cut along B
	// the isTopTri_* selectors will select the top tris out if appropriate
	bi32x4 isCutLarge = CmpGtSIMD( f4CutPart, f4Epsilon ); // is the triangle part large enough to even consider it? in most cases it is
	bi32x4 isTopTri_Cpos = AndSIMD( CmpLeSIMD( f4WaterPart_Cpos, f4CutPart ), isCutLarge ), isTopTri_Cneg = AndSIMD( CmpLeSIMD( f4WaterPart_Cneg, f4CutPart ), isCutLarge );
	//fltx4 isBotTri_Cpos = AndNotSIMD( isTopTri_Cpos, isCutLarge ), isBotTri_Cneg = AndNotSIMD( isTopTri_Cneg, isCutLarge );
	// integrate above-water part
	fltx4 rcpCutPart = AndSIMD( ReciprocalSIMD( f4CutPart ), isCutLarge ); // when this is Inf, isCutLarge will select it off
	fltx4 f4WaterInTop_Cpos = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterPart_Cpos ) );
	fltx4 f4WaterInTop_Cneg = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterPart_Cneg ) );
	// when water is below the tri, it'll actually be selected off, so the min(cut,water) isn't needed here really
	FourVectors boxTopTriB_Cpos = boxB * ( f4WaterInTop_Cpos * rcpCutPart ), boxTopTriB_Cneg = boxB * ( f4WaterInTop_Cneg * rcpCutPart );
	FourVectors boxTopTriA_Cpos = boxA * f4WaterInTop_Cpos, boxTopTriA_Cneg = boxA * f4WaterInTop_Cneg;
	fltx4 f4TopTriProj_Cpos = fabs( CrossZ( boxTopTriA_Cpos, boxTopTriB_Cpos ) ), f4TopTriProj_Cneg = fabs( CrossZ( boxTopTriA_Cneg, boxTopTriB_Cneg ) );
	fltx4 f4WaterInBot_common = Four_Ones + f4CutPart, f4CutPart_neg = -f4CutPart;
	// fltx4 f4WaterInBot_Cpos = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterInBot_common - f4WaterPart_Cpos ) );
	// fltx4 f4WaterInBot_Cneg = MaxSIMD( Four_Zeros, MinSIMD( f4CutPart, f4WaterInBot_common - f4WaterPart_Cneg ) );
	fltx4 f4WaterInBot_Cpos_neg = MinSIMD( Four_Zeros, MaxSIMD( f4CutPart_neg, f4WaterPart_Cpos - f4WaterInBot_common) );
	fltx4 f4WaterInBot_Cneg_neg = MinSIMD( Four_Zeros, MaxSIMD( f4CutPart_neg, f4WaterPart_Cneg - f4WaterInBot_common) );
	// update: (looks like) for the bottom triangle, we need to integrate (0..+2) and (0..+2) in positive triangle, so we'll just need to flip
	// the signs for the bottom triangle A and B vectors
	FourVectors boxBotTriB_Cpos = boxB * ( f4WaterInBot_Cpos_neg * rcpCutPart ), boxBotTriB_Cneg = boxB * ( f4WaterInBot_Cneg_neg * rcpCutPart );
	FourVectors boxBotTriA_Cpos = boxA * f4WaterInBot_Cpos_neg, boxBotTriA_Cneg = boxA * f4WaterInBot_Cneg_neg;
	fltx4 f4BotTriProj_Cpos = fabs( CrossZ( boxBotTriA_Cpos, boxBotTriB_Cpos ) ), f4BotTriProj_Cneg = fabs( CrossZ( boxBotTriA_Cneg, boxBotTriB_Cneg ) );
	// let's integrate along topTriA (0..-2) and topTriB (0..-2), a triangle . Here's the solved integral:
	// 2/3 (xA (-2 z0 + 2 zA + zB) + xB (-2 z0 + zA + 2 zB) + x0 (3 z0 - 2 (zA + zB))),
	// 2/3 (yA (-2 z0 + 2 zA + zB) + yB (-2 z0 + zA + 2 zB) + y0 (3 z0 - 2 (zA + zB))),
	// 1/3 (3 z0^2 - 4 z0 (zA + zB) + 2 (zA^2 + zA zB + zB^2)),
	// 2/3 (3 z0 - 2 (zA + zB))
	// here's collected by x0,y0,z0
	// 2/3 (-2 xA - 2 xB) z0 + 2/3 (2 xA zA + xB zA + xA zB + 2 xB zB) + x0 (2 z0 - (4 (zA + zB))/3),
	// 2/3 (-2 yA - 2 yB) z0 + 2/3 (2 yA zA + yB zA + yA zB + 2 yB zB) + y0 (2 z0 - (4 (zA + zB))/3),
	// z0^2 - 4/3 z0 (zA + zB) + 2/3 (zA^2 + zA zB + zB^2),
	// 2 z0 - (4 (zA + zB))/3
	// x0,y0,z0 are the boxTopZ for Cpos, and boxTopZ - 2 C for Cneg
	fltx4 f4TopTriX0_Cneg = MsubSIMD( Four_Twos, boxC.x, boxTopX );
	fltx4 f4TopTriY0_Cneg = MsubSIMD( Four_Twos, boxC.y, boxTopY );
	fltx4 f4TopTriZ0_Cneg = MsubSIMD( Four_Twos, boxC.z, boxTopZabs );
	fltx4 f4TopTri_X_Cpos = Four_TwoThirds * (boxTopTriA_Cpos.x * ( Four_Twos * ( boxTopTriA_Cpos.z - boxTopZabs ) + boxTopTriB_Cpos.z ) + boxTopTriB_Cpos.x * ( boxTopTriA_Cpos.z + Four_Twos * ( boxTopTriB_Cpos.z - boxTopZabs ) ) + boxTopX * (Four_Threes * boxTopZabs - Four_Twos * ( boxTopTriA_Cpos.z + boxTopTriB_Cpos.z ) ) );
	fltx4 f4TopTri_Y_Cpos = Four_TwoThirds * (boxTopTriA_Cpos.y * ( Four_Twos * ( boxTopTriA_Cpos.z - boxTopZabs ) + boxTopTriB_Cpos.z ) + boxTopTriB_Cpos.y * ( boxTopTriA_Cpos.z + Four_Twos * ( boxTopTriB_Cpos.z - boxTopZabs ) ) + boxTopY * (Four_Threes * boxTopZabs - Four_Twos * ( boxTopTriA_Cpos.z + boxTopTriB_Cpos.z ) ) );
	fltx4 f4TopTri_Z_Cpos = Four_Thirds * (Four_Threes * boxTopZabs * boxTopZabs - Four_Fours * boxTopZabs * (boxTopTriA_Cpos.z + boxTopTriB_Cpos.z) + Four_Twos * (boxTopTriA_Cpos.z * boxTopTriA_Cpos.z + boxTopTriA_Cpos.z * boxTopTriB_Cpos.z + boxTopTriB_Cpos.z*boxTopTriB_Cpos.z));
	fltx4 f4TopTri_W_Cpos = Four_TwoThirds * ( Four_Threes * boxTopZabs - Four_Twos * ( boxTopTriA_Cpos.z + boxTopTriB_Cpos.z ) );
	fltx4 f4TopTri_X_Cneg = Four_TwoThirds * (boxTopTriA_Cneg.x * ( Four_Twos * ( boxTopTriA_Cneg.z - f4TopTriZ0_Cneg ) + boxTopTriB_Cneg.z ) + boxTopTriB_Cneg.x * ( boxTopTriA_Cneg.z + Four_Twos * ( boxTopTriB_Cneg.z - f4TopTriZ0_Cneg ) ) + f4TopTriX0_Cneg * (Four_Threes * f4TopTriZ0_Cneg - Four_Twos * ( boxTopTriA_Cneg.z + boxTopTriB_Cneg.z ) ) );
	fltx4 f4TopTri_Y_Cneg = Four_TwoThirds * (boxTopTriA_Cneg.y * ( Four_Twos * ( boxTopTriA_Cneg.z - f4TopTriZ0_Cneg ) + boxTopTriB_Cneg.z ) + boxTopTriB_Cneg.y * ( boxTopTriA_Cneg.z + Four_Twos * ( boxTopTriB_Cneg.z - f4TopTriZ0_Cneg ) ) + f4TopTriY0_Cneg * (Four_Threes * f4TopTriZ0_Cneg - Four_Twos * ( boxTopTriA_Cneg.z + boxTopTriB_Cneg.z ) ) );
	fltx4 f4TopTri_Z_Cneg = Four_Thirds * (Four_Threes * f4TopTriZ0_Cneg * f4TopTriZ0_Cneg - Four_Fours * f4TopTriZ0_Cneg * (boxTopTriA_Cneg.z + boxTopTriB_Cneg.z) + Four_Twos * (boxTopTriA_Cneg.z * boxTopTriA_Cneg.z + boxTopTriA_Cneg.z * boxTopTriB_Cneg.z + boxTopTriB_Cneg.z*boxTopTriB_Cneg.z));
	fltx4 f4TopTri_W_Cneg = Four_TwoThirds * ( Four_Threes * f4TopTriZ0_Cneg - Four_Twos * ( boxTopTriA_Cneg.z + boxTopTriB_Cneg.z ) );
	// bottom triangle of the +c face starts at the corner c - a - b
	fltx4 f4BotTriX0_Cpos = boxC.x - boxA.x - boxB.x;
	fltx4 f4BotTriY0_Cpos = boxC.y - boxA.y - boxB.y;
	fltx4 f4BotTriZ0_Cpos = boxC.z - boxA.z - boxB.z + boxCenterZ;
	fltx4 f4BotTri_X_Cpos = Four_TwoThirds * (boxBotTriA_Cpos.x * ( Four_Twos * ( boxBotTriA_Cpos.z - f4BotTriZ0_Cpos ) + boxBotTriB_Cpos.z ) + boxBotTriB_Cpos.x * ( boxBotTriA_Cpos.z + Four_Twos * ( boxBotTriB_Cpos.z - f4BotTriZ0_Cpos ) ) + f4BotTriX0_Cpos * (Four_Threes * f4BotTriZ0_Cpos - Four_Twos * ( boxBotTriA_Cpos.z + boxBotTriB_Cpos.z ) ) );
	fltx4 f4BotTri_Y_Cpos = Four_TwoThirds * (boxBotTriA_Cpos.y * ( Four_Twos * ( boxBotTriA_Cpos.z - f4BotTriZ0_Cpos ) + boxBotTriB_Cpos.z ) + boxBotTriB_Cpos.y * ( boxBotTriA_Cpos.z + Four_Twos * ( boxBotTriB_Cpos.z - f4BotTriZ0_Cpos ) ) + f4BotTriY0_Cpos * (Four_Threes * f4BotTriZ0_Cpos - Four_Twos * ( boxBotTriA_Cpos.z + boxBotTriB_Cpos.z ) ) );
	fltx4 f4BotTri_Z_Cpos = Four_Thirds * (Four_Threes * f4BotTriZ0_Cpos * f4BotTriZ0_Cpos - Four_Fours * f4BotTriZ0_Cpos * (boxBotTriA_Cpos.z + boxBotTriB_Cpos.z) + Four_Twos * (boxBotTriA_Cpos.z * boxBotTriA_Cpos.z + boxBotTriA_Cpos.z * boxBotTriB_Cpos.z + boxBotTriB_Cpos.z*boxBotTriB_Cpos.z));
	fltx4 f4BotTri_W_Cpos = Four_TwoThirds * ( Four_Threes * f4BotTriZ0_Cpos - Four_Twos * ( boxBotTriA_Cpos.z + boxBotTriB_Cpos.z ) );
	// bottom triangle of the -c face starts at the lowest corner -a - b - c (hence the -boxTopX / -boxTopY below)
	fltx4 f4BotTriZ0_Cneg = boxCenterZ - boxTopZrel;
	fltx4 f4BotTri_X_Cneg = Four_TwoThirds * (boxBotTriA_Cneg.x * ( Four_Twos * ( boxBotTriA_Cneg.z - f4BotTriZ0_Cneg ) + boxBotTriB_Cneg.z ) + boxBotTriB_Cneg.x * ( boxBotTriA_Cneg.z + Four_Twos * ( boxBotTriB_Cneg.z - f4BotTriZ0_Cneg ) ) -boxTopX * (Four_Threes * f4BotTriZ0_Cneg - Four_Twos * ( boxBotTriA_Cneg.z + boxBotTriB_Cneg.z ) ) );
	fltx4 f4BotTri_Y_Cneg = Four_TwoThirds * (boxBotTriA_Cneg.y * ( Four_Twos * ( boxBotTriA_Cneg.z - f4BotTriZ0_Cneg ) + boxBotTriB_Cneg.z ) + boxBotTriB_Cneg.y * ( boxBotTriA_Cneg.z + Four_Twos * ( boxBotTriB_Cneg.z - f4BotTriZ0_Cneg ) ) -boxTopY * (Four_Threes * f4BotTriZ0_Cneg - Four_Twos * ( boxBotTriA_Cneg.z + boxBotTriB_Cneg.z ) ) );
	fltx4 f4BotTri_Z_Cneg = Four_Thirds * (Four_Threes * f4BotTriZ0_Cneg * f4BotTriZ0_Cneg - Four_Fours * f4BotTriZ0_Cneg * (boxBotTriA_Cneg.z + boxBotTriB_Cneg.z) + Four_Twos * (boxBotTriA_Cneg.z * boxBotTriA_Cneg.z + boxBotTriA_Cneg.z * boxBotTriB_Cneg.z + boxBotTriB_Cneg.z*boxBotTriB_Cneg.z));
	fltx4 f4BotTri_W_Cneg = Four_TwoThirds * ( Four_Threes * f4BotTriZ0_Cneg - Four_Twos * ( boxBotTriA_Cneg.z + boxBotTriB_Cneg.z ) );
	// per face: either (full side - top triangle) when the water line crosses the top triangle,
	// or (bottom triangle + center parallelogram) otherwise
	fltx4 f4All_X_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_X_Cpos - f4TopTriProj_Cpos * f4TopTri_X_Cpos, f4BotTriProj_Cpos * f4BotTri_X_Cpos + f4CenterProj_Cpos * f4Center_X_Cpos );
	fltx4 f4All_X_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_X_Cneg - f4TopTriProj_Cneg * f4TopTri_X_Cneg, f4BotTriProj_Cneg * f4BotTri_X_Cneg + f4CenterProj_Cneg * f4Center_X_Cneg );
	fltx4 f4All_Y_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_Y_Cpos - f4TopTriProj_Cpos * f4TopTri_Y_Cpos, f4BotTriProj_Cpos * f4BotTri_Y_Cpos + f4CenterProj_Cpos * f4Center_Y_Cpos );
	fltx4 f4All_Y_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_Y_Cneg - f4TopTriProj_Cneg * f4TopTri_Y_Cneg, f4BotTriProj_Cneg * f4BotTri_Y_Cneg + f4CenterProj_Cneg * f4Center_Y_Cneg );
	fltx4 f4All_Z_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_Z_Cpos - f4TopTriProj_Cpos * f4TopTri_Z_Cpos, f4BotTriProj_Cpos * f4BotTri_Z_Cpos + f4CenterProj_Cpos * f4Center_Z_Cpos );
	fltx4 f4All_Z_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_Z_Cneg - f4TopTriProj_Cneg * f4TopTri_Z_Cneg, f4BotTriProj_Cneg * f4BotTri_Z_Cneg + f4CenterProj_Cneg * f4Center_Z_Cneg );
	fltx4 f4All_W_Cpos = MaskedAssign( isTopTri_Cpos, f4SideProj * f4Full_W_Cpos - f4TopTriProj_Cpos * f4TopTri_W_Cpos, f4BotTriProj_Cpos * f4BotTri_W_Cpos + f4CenterProj_Cpos * f4Center_W_Cpos );
	fltx4 f4All_W_Cneg = MaskedAssign( isTopTri_Cneg, f4SideProj * f4Full_W_Cneg - f4TopTriProj_Cneg * f4TopTri_W_Cneg, f4BotTriProj_Cneg * f4BotTri_W_Cneg + f4CenterProj_Cneg * f4Center_W_Cneg );
	fltx4 f4All_X = Sum3SIMD( f4All_X_Cpos - f4All_X_Cneg );
	fltx4 f4All_Y = Sum3SIMD( f4All_Y_Cpos - f4All_Y_Cneg );
	// <Sergiy> to be brutally honest, I don't care about Z integral. It represents the Z of the lever of archimedes force, and
	// it affects neither force nor torque exerted by the said force. Not computing it here reduces this routine from 1188 ticks to 900 ticks per run
	(void)f4All_Z_Cpos;
	(void)f4All_Z_Cneg;
	fltx4 f4All_Z = Four_Zeros;//Sum3SIMD( f4All_Z_Cpos - f4All_Z_Cneg );
	fltx4 f4All_W = Sum3SIMD( f4All_W_Cpos - f4All_W_Cneg );
#if 1
	// <Sergiy> again, to be brutally honest, I don't care about the actual lever of archimedes force.
	// I can just as well use lever * displaced_volume to compute the torque, and it'll actually be more precise, although less understandable.
	// this variant returns XYZ of the center of mass of displaced fluid multiplied by W, and W = volume of displaced fluid
	fltx4 f4All = CombineSIMD( f4All_X, f4All_Y, f4All_Z, f4All_W ) + f4All_W * boxCenterXY;
#else
	// this variant returns XYZ of the center of mass of displaced fluid, and W = volume of displaced fluid
	fltx4 rcpAllW = ReciprocalSIMD( f4All_W );
	fltx4 f4All = SetWSIMD( CombineXYZ_Special( f4All_X, f4All_Y, f4All_Z ) * rcpAllW + boxCenterXY, f4All_W );
#endif
	return f4All;
}
/*float GetBoxBuoyancyTest( const matrix3x4_t & tm ) {
} */
// Buoyancy contribution of one box face, treated as the base of a pyramid whose apex is the origin.
//   pos  - box center
//   a, b - half-edge vectors spanning the face
//   n    - offset from the box center to the face center
// The face quad is clipped against the plane z = 0 and only the z <= 0 part is kept. The kept polygon is
// fanned into triangles, and each triangle together with the origin forms one tetrahedron.
// Returns w = sum of tetrahedron volumes times the face sign, and xyz = sum of ( volume * tetrahedron centroid )
// times the same sign. Note that xyz is NOT divided by the volume (see the disabled #else variant below).
Vector4D GetPyramidBuoyancy( const Vector &pos, const Vector &a, const Vector &b, const Vector &n )
{
	Vector corners[5], clipped[10];
	uint numCorners = 4, numClipped = 0;

	// the four corners of the face, walked around its perimeter
	const Vector faceCenter = pos + n;
	corners[0] = faceCenter + a + b;
	corners[1] = faceCenter + a - b;
	corners[2] = faceCenter - a - b;
	corners[3] = faceCenter - a + b;

	// walk the perimeter; wherever an edge crosses z = 0, insert the crossing point ahead of the edge's end corner
	Vector lastCorner = corners[3];
	for ( uint k = 0; k < numCorners; ++k )
	{
		const Vector &corner = corners[k];
		if ( lastCorner.z * corner.z < 0 )
		{
			// the two end points are on opposite sides of the plane
			float flFraction = lastCorner.z / ( lastCorner.z - corner.z );
			Vector &crossing = clipped[numClipped];
			crossing = lastCorner * ( 1 - flFraction ) + corner * flFraction;
			Assert( fabs( crossing.z ) < 1e-5f );
			crossing.z = 0; // snap exactly onto the plane
			++numClipped;
		}
		clipped[numClipped] = corner;
		++numClipped;
		lastCorner = corner;
	}

	// faces whose center lies on the back side of their own normal contribute negatively
	const Vector faceNormal = CrossProduct( a, b );
	Assert( DotProduct( faceNormal, n ) >= -1e-6f );
	const float flSign = ( DotProduct( faceCenter, faceNormal ) < 0 ) ? -1.0f : 1.0f;

	// drop every vertex with z > 0, keeping the survivors in order
	uint numSubmerged = 0;
	for ( uint k = 0; k < numClipped; ++k )
	{
		if ( clipped[k].z > 0 )
		{
			continue;
		}
		if ( numSubmerged != k )
		{
			clipped[numSubmerged] = clipped[k];
		}
		++numSubmerged;
	}

	// triangle fan around the first surviving vertex; every fan triangle is entirely at z <= 0
	const Vector fanRoot = clipped[0];
	float flVolume = 0;
	Vector vecMoment( 0, 0, 0 );
	for ( uint k = 2; k < numSubmerged; ++k )
	{
		const Vector &fanPrev = clipped[k - 1];
		const Vector &fanNext = clipped[k];

		// unsigned volume of the tetrahedron ( origin, fanRoot, fanPrev, fanNext )
		const float flTetVolume = fabs( DotProduct( CrossProduct( fanPrev, fanRoot ), fanNext ) / 6 );
		flVolume += flTetVolume;

		// centroid of that tetrahedron: the fourth vertex is the origin, hence the division by 4
		const Vector vecTetCenter = ( fanRoot + fanPrev + fanNext ) * 0.25f;
		vecMoment += flTetVolume * vecTetCenter;
	}

	Vector4D result;
#if 1
	// volume-weighted centroid sum, signed
	result.Init( vecMoment * flSign, flVolume * flSign );
#else
	// normalized centroid variant
	result.Init( flVolume > 1e-8f ? vecMoment / flVolume : Vector( 0, 0, 0 ), flVolume * flSign );
#endif
	return result;
}
/*Vector4D GetQuadBuoyancy( const Vector &pos, const Vector &a, const Vector &b, const Vector &n )
{ Vector verts[4], verts2[10]; uint numVerts = 4, numVerts2 = 0;
Vector acrossb = CrossProduct( a, b );
float flAreaXIntegral = acrossb.x * 4 * ( pos.x + n.x ); float flAreaYIntegral = acrossb.y * 4 * ( pos.y + n.y ); float flAreaZIntegral = acrossb.z * 4 * ( pos.z + n.z ); Vector4D vecIntegral; vecIntegral.w = flAreaZIntegral; Vector center = pos + n; Assert(DotProduct(n, acrossb) > 0);
float x0 = center.x, y0 = center.y, z0 = center.z; float xA = a.x, yA = a.y, zA = a.z; float xB = b.x, yB = b.y, zB = b.z;
vecIntegral.Init( 4* x0 *z0 + (xA* zA + xB*zB)/3, 4* y0 *z0 + (yA* zA + yB*zB)/3, 2* z0 *z0 + (zA* zA + zB*zB)/3, 4* z0);
return vecIntegral * acrossb.z; } */
// Exchanges the values of two vectors through a temporary copy.
inline void Swap( Vector &a, Vector &b )
{
	const Vector saved = a;
	a = b;
	b = saved;
}
/*Vector4D GetBuoyancy( const Vector &pos, Vector box[3] ) { float rcpZ[3]; for(int i = 0; i < 3; ++i) { if( box[i].z < 0 ) box[i] = -box[i]; for(int j = 0; j < i; ++j) { if(box[j].z < box[i].z) Swap(box[i], box[j]); } } for(int i = 0; i < 3; ++i) rcpZ[i] = box[i].z > 1e-7f? 1 / box[i].z : 0;
uint numVerts = 4, numVerts2 = 0;
Vector acrossb = CrossProduct( a, b );
float flAreaXIntegral = acrossb.x * 4 * ( pos.x + n.x ); float flAreaYIntegral = acrossb.y * 4 * ( pos.y + n.y ); float flAreaZIntegral = acrossb.z * 4 * ( pos.z + n.z );
Vector4D vecIntegral; vecIntegral.w = flAreaZIntegral; Vector center = pos + n; Assert(DotProduct(n, acrossb) > 0);
float x0 = center.x, y0 = center.y, z0 = center.z; float xA = a.x, yA = a.y, zA = a.z; float xB = b.x, yB = b.y, zB = b.z;
vecIntegral.Init( 4* x0 *z0 + (xA* zA + xB*zB)/3, 4* y0 *z0 + (yA* zA + yB*zB)/3, 2* z0 *z0 + (zA* zA + zB*zB)/3, 4* z0);
return vecIntegral * acrossb.z; }*/
// Merges two ( point.xyz, weight.w ) tuples: the result's w is the sum of both weights and its xyz is the
// weight-averaged point. If the combined weight is within 1e-6 of zero the point is set to the origin
// instead of dividing.
Vector4D operator % ( const Vector4D & a, const Vector4D & b )
{
	const float flTotalWeight = a.w + b.w;

	Vector vecAverage( 0, 0, 0 );
	if ( fabs( flTotalWeight ) > 1e-6f )
	{
		vecAverage = ( a.AsVector3D() * a.w + b.AsVector3D() * b.w ) / flTotalWeight;
	}

	Vector4D merged;
	merged.Init( vecAverage, flTotalWeight );
	return merged;
}
// Scalar reference implementation: sums GetPyramidBuoyancy over the six faces of the box with half-axes
// a, b, c centered at pos. Each axis contributes its positive and its negative face; the two in-face
// vectors are swapped for the negative face.
Vector4D GetBoxBuoyancy( const Vector& a, const Vector& b, const Vector& c, const Vector& pos )
{
	Vector4D facePosC = GetPyramidBuoyancy( pos, a, b, c );
	Vector4D faceNegC = GetPyramidBuoyancy( pos, b, a, -c );

	Vector4D facePosB = GetPyramidBuoyancy( pos, c, a, b );
	Vector4D faceNegB = GetPyramidBuoyancy( pos, a, c, -b );

	Vector4D facePosA = GetPyramidBuoyancy( pos, b, c, a );
	Vector4D faceNegA = GetPyramidBuoyancy( pos, c, b, -a );

	return facePosC + faceNegC + facePosB + faceNegB + facePosA + faceNegA;
}
// Times the scalar six-pyramid buoyancy computation over a fixed number of iterations and prints the
// average tick count per box together with the average accumulated w.
// 'a' is taken by value because it is nudged by a tiny amount on every iteration, so the inputs differ
// from one iteration to the next.
void BenchmarkBoxBuoyancy( Vector a, const Vector& b, const Vector& c, const Vector& pos )
{
	const int nIterations = 100000;

	const int tickStart = GetHardwareClockFast();

	Vector4D accum;
	accum.Init( 0, 0, 0, 0 );
	for ( int nIter = 0; nIter < nIterations; ++nIter )
	{
		Vector4D box =
			GetPyramidBuoyancy( pos, a, b, c ) %
			GetPyramidBuoyancy( pos, b, a, -c ) %
			GetPyramidBuoyancy( pos, c, a, b ) %
			GetPyramidBuoyancy( pos, a, c, -b ) %
			GetPyramidBuoyancy( pos, b, c, a ) %
			GetPyramidBuoyancy( pos, c, b, -a );
		accum = accum % box;

		a += Vector(1e-24f, 1e-25f, 1e-26f);
	}

	const int tickEnd = GetHardwareClockFast();

	Msg( "Box Buoyancy Scalar Benchmark: %d ticks/box, volume %g \n", int32( ( tickEnd - tickStart ) ) / nIterations, accum.w / nIterations );
}
// Returns 'in' rotated by QAngle( 0, flDegrees, 0 ).
// NOTE(review): presumably the second QAngle component is yaw (rotation about Z) - confirm against QAngle.
const Vector RotateZ( const Vector & in, float flDegrees )
{
	const QAngle angles( 0, flDegrees, 0 );
	Vector rotated;
	VectorRotate( in, angles, rotated );
	return rotated;
}
// Returns 'in' rotated by QAngle( flDegrees, 0, 0 ).
// NOTE(review): presumably the first QAngle component is pitch (rotation about Y) - confirm against QAngle.
const Vector RotateY( const Vector & in, float flDegrees )
{
	const QAngle angles( flDegrees, 0, 0 );
	Vector rotated;
	VectorRotate( in, angles, rotated );
	return rotated;
}
// Value-returning convenience wrapper around VectorRotate: returns 'in' rotated by the angles 'a'.
const Vector Rotate( const Vector & in, const QAngle &a )
{
	Vector rotated;
	VectorRotate( in, a, rotated );
	return rotated;
}
struct Test_t { void Test() { PermTest(); #ifdef _DEBUG
BuoyancyTest(); #else
Benchmark(); #endif
bool TestAllEqual( const fltx4 & a, const fltx4 & b ) { return IsAllEqual( a, b ); }
void PermTest() { #ifdef _DEBUG
fltx4 f4Canonical = {0.125f, 1.125f, 2.125f, 3.125f}; float flCanonical[4] = {0.125f, 1.125f, 2.125f, 3.125f}; fltx4 f4CanonicalYXZW = {1.125f, 0.125f, 2.125f, 3.125f}; fltx4 f4CanonicalXZYW = {0.125f, 2.125f, 1.125f, 3.125f}; fltx4 f4CanonicalZYXW = {2.125f, 1.125f, 0.125f, 3.125f}; fltx4 f4CanonicalXXYW = {0.125f, 0.125f, 1.125f, 3.125f}; fltx4 f4CanonicalYZZW = {1.125f, 2.125f, 2.125f, 3.125f};
Assert( TestAllEqual( f4Canonical, LoadUnalignedSIMD( flCanonical ) ) );
for ( int i = 0; i < 4; ++i ) { float flSubFloat = SubFloat( f4Canonical, i ); Assert( fabs( flSubFloat - float( i ) - 0.125f ) < 1e-6f ); }
Assert( TestAllEqual( PermYXZW( f4Canonical ), ( f4CanonicalYXZW ) ) ); Assert( TestAllEqual( PermXZYW( f4Canonical ), ( f4CanonicalXZYW ) ) ); Assert( TestAllEqual( PermZYXW( f4Canonical ), ( f4CanonicalZYXW ) ) ); Assert( TestAllEqual( PermXXYW( f4Canonical ), ( f4CanonicalXXYW ) ) ); Assert( TestAllEqual( PermYZZW( f4Canonical ), f4CanonicalYZZW ) ); #endif
void BuoyancyTest() { Vector test[][3] = { {Vector( 1, 0, 0 ), Vector( 0, 0, 1 ), Vector( 0, 0, 0.0f )}, {Vector( 1, 0, 1 ), Vector( -1, 0, 1 ), Vector( 0, 0, -0.5f )}, {Vector( 0, 1, 1 ), Vector( 0, -1, 1 ), Vector( 0, 0, 0.0f )}, {Vector( 0, 2, 2 ), Vector( 0, -2, 2 ), Vector( 0, 0, 0.0f )}, {Vector( 5, 0, 5 ), Vector( -1, 0, 1 ), Vector( 0, 0, 0.0f )}, {Vector( 2, 0, 1 ), Vector( -1, 0, 2 ), Vector( 0, 0, 0.0f )}, {RotateZ(Vector( 1, 0, 1 ),45), RotateZ(Vector( -1, 0, 1 ),45), Vector( 0, 0, 0.0f )}, {RotateZ(Vector( 1, 0, 1 ),30), RotateZ(Vector( -1, 0, 1 ),30), Vector( 0, 0, 0.5f )}, {RotateZ(Vector( sqrtf(0.5f), 0, sqrtf(0.5f) ),45), RotateZ(Vector( 0, 1, 0 ),45), Vector( 0, 0, 0.5f )}, {RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0)}, // unit cube with tips extended high/low
{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0.01f)}, // unit cube with tips extended high/low
{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0.25f)}, // unit cube with tips extended high/low
{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,0.5f)}, // unit cube with tips extended high/low
{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,-0.25f)}, // unit cube with tips extended high/low
{RotateY(RotateZ(Vector(1,0,0),45),atan(sqrtf(2))*180/M_PI),RotateY(RotateZ(Vector(0,1,0),45),atan(sqrtf(2))*180/M_PI), Vector(0,0,-0.5f)}, // unit cube with tips extended high/low
{Vector( 2, 1, 1 ), Vector( -1, 1, 1 ), Vector( 0, 0, 0.0f )}, {Vector( 2, 1, 1 ), Vector( -1, 1, 1 ), Vector( 0, 0, 0.5f )}, {Vector( 0, 2, 1 ).Normalized(), Vector( 1, -1, 2 ).Normalized(), Vector( 0, 0, 0 )}, {Vector( -0.804987f, 0.250343f, -0.811212f ), Vector( 0.474009f, -0.625978f,-0.663551f ).Normalized(), Vector( 1, 0, 0 )} };
float flMaxError = 0; for ( int nAttempt = 0, numAttempts = 1000000; nAttempt < numAttempts; ++nAttempt ) { Vector a = RandomVector( -1, 1 ), c = RandomVector( -1, 1 ), b = CrossProduct( a, c ), pos = RandomVector( -2, 2 ); c = CrossProduct( a, b ).Normalized() * RandomVector( 0, 1.75f ).x; if ( nAttempt < sizeof( test ) / sizeof( test[0] ) ) { a = test[nAttempt][0]; b = test[nAttempt][1]; c = CrossProduct( a, b ).Normalized() /* a.Length()*/; pos = test[nAttempt][2]; } //pos.x = 0;
//pos.y = 0;
//pos.z = 0;
matrix3x4_t tm; tm.Init( a, b, c, pos ); FourVectors box; box.LoadAndSwizzle( LoadUnalignedSIMD( &a ), LoadUnalignedSIMD( &b ), LoadUnalignedSIMD( &c ), LoadUnalignedSIMD( &pos ) );
//fltx4 f4Result0 = GetBoxBuoyancy3x4( box );
Vector4D result1 = GetBoxBuoyancy(a,b,c,pos); fltx4 f4ResultV2 = GetBoxBuoyancy3x4( box ); fltx4 f4Residual = f4ResultV2 - LoadUnalignedSIMD( &result1 ); float flError = sqrtf( SubFloat( Dot4SIMD( f4Residual, f4Residual ), 0 ) ); if( flError > flMaxError ) { flMaxError = flError; Msg( "%d. Error %g\n", nAttempt, flError); }
Assert( IsAllGreaterThan( ReplicateX4( 1e-4f ), fabs( f4Residual ) ) );
float flBoxVolume = a.Length() * b.Length() * c.Length() * 8; (void)(flBoxVolume); // debug only
if ( ( nAttempt % ( numAttempts / 10 ) ) == 0 ) { DevMsg( "." ); } } DevMsg( "Buoyancy test completed, benchmarking\n" ); }
void Benchmark() { for ( int i = 0; i < 100; ++i ) { Vector a = RandomVector( -1, 1 ), c = RandomVector( -1, 1 ), b = CrossProduct( a, c ), pos = RandomVector( -2, 2 ); c = CrossProduct( a, b ).Normalized() * RandomVector( 0, 1.75f ).x; BenchmarkBoxBuoyancy4x3( LoadUnalignedSIMD( &a ), LoadUnalignedSIMD( &b ), LoadUnalignedSIMD( &c ), LoadUnalignedSIMD( &pos ) ); BenchmarkBoxBuoyancy( a,b,c,pos ); } } }; static Test_t s_test;
// Public entry point: runs the file-local buoyancy self-test / benchmark harness.
void TestBuoyancy()
{
	s_test.Test();
}