//////////////////////////////////////////////////////////////////////////
//
// Implementaiton of iterative 3x3 SVD without branches, using approximate Givens rotations,
// applied sequentially to every off-diagonal element. The same code can compile into scalar, SSE and AVX
// by templetizing on the Float data type. 
//

#include "ssemath.h"

namespace SVD
{
	inline fltx4 CmpLt( const fltx4 &a, const fltx4 &b ) { return CmpLtSIMD( a, b ); }
	inline bool CmpLt( float a, float b ) { return a < b ? true : false; }
	
	inline bool AllLe( const fltx4 &a, const fltx4 &b ){ return IsAllGreaterThanOrEq( b, a ); }
	inline bool AllLe( float a, float b ) { return a <= b; }


	template <typename Float >
	struct FloatTraits
	{
		typedef Float Bool;
	};
	template <>
	struct FloatTraits < float >
	{
		typedef bool Bool;
	};

	template< typename Float > inline Float Replicate( float a );

	template <> inline fltx4 Replicate< fltx4 >( float a )
	{
		return _mm_set1_ps( a );
	}

	template <> inline float Replicate< float >( float a ){ return a; }

	template <typename Float>
	class SymMatrix3;


	inline fltx4 RsqrtEst( const fltx4 &a ){ return ReciprocalSqrtEstSIMD( a ); }
	template <typename Float>
	inline Float Rsqrt( const Float &a )
	{
		fltx4 it = RsqrtEst( a );
		// a single Newton iteration; can repeat multiple times
		return Replicate<Float>( .5f ) * it * ( Replicate<Float>( 3.0f ) - ( a * it * it ) );
	}
	inline float RsqrtEst( float a ) 
	{
		float res;
		StoreUnalignedFloat( &res, RsqrtEst( LoadUnalignedFloatSIMD(&a)  ) ); 
		return res;
	}
	template <>
	inline float Rsqrt( const float &a ) { return 1.0f / sqrtf( a ); }

	inline fltx4 Select( const fltx4& a /*mask=0*/, const fltx4& b/*mask=1*/, const fltx4& mask )
	{
		// (((b ^ a) & mask)^a)
		return _mm_xor_ps( a, _mm_and_ps( mask, _mm_xor_ps( b, a ) ) );
	}

	inline float Select( float a, float b, bool mask )
	{
		return mask ? b : a;
	}


	template <typename Float>
	class Matrix3
	{
	public:
		Matrix3() {}
		Float m[ 3 ][ 3 ];

		Matrix3 < Float > operator * ( const Matrix3< Float > &other )const
		{
			Matrix3 < Float > res;
			for ( int i = 0; i < 3; ++i )
				for ( int j = 0; j < 3; ++j )
					res.m[ i ][ j ] = m[ i ][ 0 ] * other.m[ 0 ][ j ] + m[ i ][ 1 ] * other.m[ 1 ][ j ] + m[ i ][ 2 ] * other.m[ 2 ][ j ];
			return res;
		}
		Matrix3 < Float > operator - ( const Matrix3< Float > &other )const
		{
			Matrix3 < Float > res;
			for ( int i = 0; i < 3; ++i )
				for ( int j = 0; j < 3; ++j )
					res.m[ i ][ j ] = m[ i ][ j ] - other.m[ i ][ j ];
			return res;
		}
		explicit Matrix3< Float >( const SymMatrix3< Float > &other );
		Float FrobeniusNorm()const
		{
			Float sum = Replicate<Float>( 0.0f );
			for ( int i = 0; i < 3; ++i )
				for ( int j = 0; j < 3; ++j )
					sum += m[ i ][ j ] * m[ i ][ j ];
			return sum;
		}

		Float AtA( int i, int j )const
		{
			return m[ 0 ][ i ] * m[ 0 ][ j ] + m[ 1 ][ i ] * m[ 1 ][ j ] + m[ 2 ][ i ] * m[ 2 ][ j ];
		}

		void SetIdentity()
		{
			m[ 0 ][ 0 ] = m[ 1 ][ 1 ] = m[ 2 ][ 2 ] = Replicate<Float>( 1.0f );
			m[ 0 ][ 1 ] = m[ 1 ][ 0 ] = m[ 0 ][ 2 ] = m[ 2 ][ 0 ] = m[ 2 ][ 1 ] = m[ 1 ][ 2 ] = Replicate<Float>( 0.0f );
		}
		void SetZero()
		{
			memset( this, 0, sizeof( *this ) );
		}
		Float ColLenSqr( int j )const
		{
			return m[ 0 ][ j ] * m[ 0 ][ j ] + m[ 1 ][ j ] * m[ 1 ][ j ] + m[ 2 ][ j ] * m[ 2 ][ j ];
		}
		Float Det()const
		{
			return -( m[ 0 ][ 2 ]*m[ 1 ][ 1 ]*m[ 2 ][ 0 ] ) + m[ 0 ][ 1 ]*m[ 1 ][ 2 ]*m[ 2 ][ 0 ] + m[ 0 ][ 2 ]*m[ 1 ][ 0 ]*m[ 2 ][ 1 ] - m[ 0 ][ 0 ]*m[ 1 ][ 2 ]*m[ 2 ][ 1 ] - m[ 0 ][ 1 ]*m[ 1 ][ 0 ]*m[ 2 ][ 2 ] + m[ 0 ][ 0 ]*m[ 1 ][ 1 ]*m[ 2 ][ 2 ];
		}
	};


	template <typename Float>
	inline Matrix3 < Float > MulT( const Matrix3< Float > &u, const Matrix3< Float > &vt )
	{
		Matrix3 < Float > res;
		for ( int i = 0; i < 3; ++i )
			for ( int j = 0; j < 3; ++j )
				res.m[ i ][ j ] = u.m[ i ][ 0 ] * vt.m[ j ][ 0 ] + u.m[ i ][ 1 ] * vt.m[ j ][ 1 ] + u.m[ i ][ 2 ] * vt.m[ j ][ 2 ];
		return res;
	}


	template <typename Float>
	Float OrthogonalityError( const Matrix3<Float> &m )
	{
		Float result = Replicate<Float>( 0.0f );
		for ( int i = 0; i < 2; ++i )
		{
			for ( int j = i + 1; j < 3; ++j )
			{
				Float dot = m.m[ 0 ][ i ] * m.m[ 0 ][ j ] + m.m[ 1 ][ i ] * m.m[ 1 ][ j ] + m.m[ 2 ][ i ] * m.m[ 2 ][ j ];
				result += dot * dot;
			}
		}
		return result;
	}

	template <typename Float >
	class DiagMatrix3
	{
	public:
		Float m[ 3 ];
	};

	template <typename Float>
	class SymMatrix3
	{
	public:
		enum Index_t
		{
			a00, a10, a11, a20, a21, a22, Count, a01 = a10, a02 = a20, a12 = a21
		};

		Float m[ 6 ];
		Float &m00() { return m[ a00 ]; }
		Float &m01() { return m[ a01 ]; }
		Float &m02() { return m[ a02 ]; }
		Float &m11() { return m[ a11 ]; }
		Float &m12() { return m[ a12 ]; }
		Float &m22() { return m[ a22 ]; }

		Float OffDiagNorm()const { return m[ a01 ] * m[ a01 ] + m[ a21 ] * m[ a21 ] + m[ a02 ] * m[ a02 ]; }
		Float DiagNorm()const { return m[ a00 ] * m[ a00 ] + m[ a11 ] * m[ a11 ] + m[ a22 ] * m[ a22 ]; }
	};


	template < typename Float >
	Matrix3< Float >::Matrix3( const SymMatrix3< Float > &other )
	{
		m[ 0 ][ 0 ] = other.m[ other.a00 ];
		m[ 0 ][ 1 ] = m[ 1 ][ 0 ] = other.m[ other.a01 ];
		m[ 1 ][ 1 ] = other.m[ other.a11 ];
		m[ 2 ][ 0 ] = m[ 0 ][ 2 ] = other.m[ other.a01 ];
		m[ 2 ][ 1 ] = m[ 1 ][ 2 ] = other.m[ other.a01 ];
		m[ 2 ][ 2 ] = other.m[ other.a22 ];
	}


	template <typename Float >
	class SinCos
	{
	public:
		Float s, c;
		SinCos() {}
		SinCos( const Float &_c, const Float &_s ) : c( _c ), s( _s ) {}

		SinCos< Float> DoubleAngle()const
		{
			SinCos< Float> res;
			res.s = Replicate<Float>( 2.0f ) * s * c;
			res.c = c * c - s * s;
			return res;
		}
	};

	template <typename Float>
	class Quaternion
	{
	public:
		Float x, y, z, w;

		void SetIdentity()
		{
			x = y = z = Replicate<float>( 0.0f );
			w = Replicate<float>( 1.0f );
		}

		Quaternion<Float> operator * ( const Float &f )const
		{
			Quaternion< Float > res;
			res.x = x * f;
			res.y = y * f;
			res.z = z * f;
			res.w = w * f;
			return res;
		}

		Float LengthSqr() const
		{
			return x * x + y * y + z * z + w * w;
		}
	};

	template <typename Float>
	Matrix3<Float> QuaternionMatrix( const Quaternion<Float> &q )
	{
		Matrix3<Float> matrix;
		const Float one = Replicate<Float>( 1.0f ), two = Replicate<Float>( 2.0f );
		matrix.m[ 0 ][ 0 ] = one - two * q.y * q.y - two * q.z * q.z;
		matrix.m[ 1 ][ 0 ] = two * q.x * q.y + two * q.w * q.z;
		matrix.m[ 2 ][ 0 ] = two * q.x * q.z - two * q.w * q.y;
		matrix.m[ 0 ][ 1 ] = two * q.x * q.y - two * q.w * q.z;
		matrix.m[ 1 ][ 1 ] = one - two * q.x * q.x - two * q.z * q.z;
		matrix.m[ 2 ][ 1 ] = two * q.y * q.z + two * q.w * q.x;
		matrix.m[ 0 ][ 2 ] = two * q.x * q.z + two * q.w * q.y;
		matrix.m[ 1 ][ 2 ] = two * q.y * q.z - two * q.w * q.x;
		matrix.m[ 2 ][ 2 ] = one - two * q.x * q.x - two * q.y * q.y;
		return matrix;
	}

	template <typename Float >
	inline SymMatrix3< Float > AtA( const Matrix3< Float > &a )
	{
		SymMatrix3< Float > res;
		res.m[ res.a00 ] = a.AtA( 0, 0 );
		res.m[ res.a10 ] = a.AtA( 1, 0 );
		res.m[ res.a11 ] = a.AtA( 1, 1 );
		res.m[ res.a20 ] = a.AtA( 2, 0 );
		res.m[ res.a21 ] = a.AtA( 2, 1 );
		res.m[ res.a22 ] = a.AtA( 2, 2 );
		return res;
	}


	template <typename Float >
	inline SymMatrix3< Float > QtAQ( const Matrix3< Float > &q, const SymMatrix3< Float > &a )
	{
		SymMatrix3< Float > res;
		res.m[ res.a00 ] = q.m[ 0 ][ 0 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 1 ][ 0 ] + a.m[ a.a02 ] * q.m[ 2 ][ 0 ] ) +
			q.m[ 1 ][ 0 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 1 ][ 0 ] + a.m[ a.a12 ] * q.m[ 2 ][ 0 ] ) +
			q.m[ 2 ][ 0 ] * ( a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 1 ][ 0 ] + a.m[ a.a22 ] * q.m[ 2 ][ 0 ] );
		res.m[ res.a01 ] =
			q.m[ 0 ][ 1 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 1 ][ 0 ] + a.m[ a.a02 ] * q.m[ 2 ][ 0 ] ) +
			q.m[ 1 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 1 ][ 0 ] + a.m[ a.a12 ] * q.m[ 2 ][ 0 ] ) + ( a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 1 ][ 0 ] +
			a.m[ a.a22 ] * q.m[ 2 ][ 0 ] ) * q.m[ 2 ][ 1 ];
		res.m[ res.a02 ] =
			q.m[ 0 ][ 2 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 1 ][ 0 ] + a.m[ a.a02 ] * q.m[ 2 ][ 0 ] ) +
			q.m[ 1 ][ 2 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 1 ][ 0 ] + a.m[ a.a12 ] * q.m[ 2 ][ 0 ] ) + ( a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 1 ][ 0 ] +
			a.m[ a.a22 ] * q.m[ 2 ][ 0 ] ) * q.m[ 2 ][ 2 ];
		res.m[ res.a11 ] =
			q.m[ 0 ][ 1 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 1 ] + a.m[ a.a01 ] * q.m[ 1 ][ 1 ] + a.m[ a.a02 ] * q.m[ 2 ][ 1 ] ) +
			q.m[ 1 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 1 ] + a.m[ a.a11 ] * q.m[ 1 ][ 1 ] + a.m[ a.a12 ] * q.m[ 2 ][ 1 ] ) +
			q.m[ 2 ][ 1 ] * ( a.m[ a.a02 ] * q.m[ 0 ][ 1 ] + a.m[ a.a12 ] * q.m[ 1 ][ 1 ] + a.m[ a.a22 ] * q.m[ 2 ][ 1 ] );
		res.m[ res.a12 ] =
			q.m[ 0 ][ 2 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 1 ] + a.m[ a.a01 ] * q.m[ 1 ][ 1 ] + a.m[ a.a02 ] * q.m[ 2 ][ 1 ] ) +
			q.m[ 1 ][ 2 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 1 ] + a.m[ a.a11 ] * q.m[ 1 ][ 1 ] + a.m[ a.a12 ] * q.m[ 2 ][ 1 ] ) + ( a.m[ a.a02 ] * q.m[ 0 ][ 1 ] + a.m[ a.a12 ] * q.m[ 1 ][ 1 ] +
			a.m[ a.a22 ] * q.m[ 2 ][ 1 ] ) * q.m[ 2 ][ 2 ];
		res.m[ res.a22 ] =
			q.m[ 0 ][ 2 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 2 ] + a.m[ a.a01 ] * q.m[ 1 ][ 2 ] + a.m[ a.a02 ] * q.m[ 2 ][ 2 ] ) +
			q.m[ 1 ][ 2 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 2 ] + a.m[ a.a11 ] * q.m[ 1 ][ 2 ] + a.m[ a.a12 ] * q.m[ 2 ][ 2 ] ) +
			q.m[ 2 ][ 2 ] * ( a.m[ a.a02 ] * q.m[ 0 ][ 2 ] + a.m[ a.a12 ] * q.m[ 1 ][ 2 ] + a.m[ a.a22 ] * q.m[ 2 ][ 2 ] );
		return res;
	}

	template <typename Float >
	inline SymMatrix3< Float > QAQt( const Matrix3< Float > &q, const SymMatrix3< Float > &a )
	{
		SymMatrix3< Float > res;
		res.m[ res.a00 ] = q.m[ 0 ][ 0 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 0 ][ 1 ] + a.m[ a.a02 ] * q.m[ 0 ][ 2 ] ) +
			q.m[ 0 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 0 ][ 1 ] + a.m[ a.a12 ] * q.m[ 0 ][ 2 ] ) +
			q.m[ 0 ][ 2 ] * ( a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 0 ][ 1 ] + a.m[ a.a22 ] * q.m[ 0 ][ 2 ] );
		res.m[ res.a01 ] =
			q.m[ 1 ][ 0 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 0 ][ 1 ] + a.m[ a.a02 ] * q.m[ 0 ][ 2 ] ) +
			q.m[ 1 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 0 ][ 1 ] + a.m[ a.a12 ] * q.m[ 0 ][ 2 ] ) + ( a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 0 ][ 1 ] +
			a.m[ a.a22 ] * q.m[ 0 ][ 2 ] ) * q.m[ 1 ][ 2 ];
		res.m[ res.a02 ] =
			q.m[ 2 ][ 0 ] * ( a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 0 ][ 1 ] + a.m[ a.a02 ] * q.m[ 0 ][ 2 ] ) +
			q.m[ 2 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 0 ][ 1 ] + a.m[ a.a12 ] * q.m[ 0 ][ 2 ] ) + ( a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 0 ][ 1 ] +
			a.m[ a.a22 ] * q.m[ 0 ][ 2 ] ) * q.m[ 2 ][ 2 ];
		res.m[ res.a11 ] =
			q.m[ 1 ][ 0 ] * ( a.m[ a.a00 ] * q.m[ 1 ][ 0 ] + a.m[ a.a01 ] * q.m[ 1 ][ 1 ] + a.m[ a.a02 ] * q.m[ 1 ][ 2 ] ) +
			q.m[ 1 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 1 ][ 0 ] + a.m[ a.a11 ] * q.m[ 1 ][ 1 ] + a.m[ a.a12 ] * q.m[ 1 ][ 2 ] ) +
			q.m[ 1 ][ 2 ] * ( a.m[ a.a02 ] * q.m[ 1 ][ 0 ] + a.m[ a.a12 ] * q.m[ 1 ][ 1 ] + a.m[ a.a22 ] * q.m[ 1 ][ 2 ] );
		res.m[ res.a12 ] =
			q.m[ 2 ][ 0 ] * ( a.m[ a.a00 ] * q.m[ 1 ][ 0 ] + a.m[ a.a01 ] * q.m[ 1 ][ 1 ] + a.m[ a.a02 ] * q.m[ 1 ][ 2 ] ) +
			q.m[ 2 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 1 ][ 0 ] + a.m[ a.a11 ] * q.m[ 1 ][ 1 ] + a.m[ a.a12 ] * q.m[ 1 ][ 2 ] ) + ( a.m[ a.a02 ] * q.m[ 1 ][ 0 ] + a.m[ a.a12 ] * q.m[ 1 ][ 1 ] +
			a.m[ a.a22 ] * q.m[ 1 ][ 2 ] ) * q.m[ 2 ][ 2 ];
		res.m[ res.a22 ] =
			q.m[ 2 ][ 0 ] * ( a.m[ a.a00 ] * q.m[ 2 ][ 0 ] + a.m[ a.a01 ] * q.m[ 2 ][ 1 ] + a.m[ a.a02 ] * q.m[ 2 ][ 2 ] ) +
			q.m[ 2 ][ 1 ] * ( a.m[ a.a01 ] * q.m[ 2 ][ 0 ] + a.m[ a.a11 ] * q.m[ 2 ][ 1 ] + a.m[ a.a12 ] * q.m[ 2 ][ 2 ] ) +
			q.m[ 2 ][ 2 ] * ( a.m[ a.a02 ] * q.m[ 2 ][ 0 ] + a.m[ a.a12 ] * q.m[ 2 ][ 1 ] + a.m[ a.a22 ] * q.m[ 2 ][ 2 ] );
		return res;
	}


	template <typename Float >
	inline SymMatrix3< Float > QAQt( const Matrix3< Float > &q, const DiagMatrix3< Float > &a )
	{
		SymMatrix3< Float > res;
		res.m[ res.a00 ] = q.m[ 0 ][ 0 ] * a.m[ 0 ] * q.m[ 0 ][ 0 ] +
			q.m[ 0 ][ 1 ] *  a.m[ 1 ] * q.m[ 0 ][ 1 ]  +
			q.m[ 0 ][ 2 ] *  a.m[ 2 ] * q.m[ 0 ][ 2 ] ;
		res.m[ res.a01 ] =
			q.m[ 1 ][ 0 ] * ( a.m[ 0 ] * q.m[ 0 ][ 0 ]) +
			q.m[ 1 ][ 1 ] * ( a.m[ 1 ] * q.m[ 0 ][ 1 ]) + ( a.m[ 2 ] * q.m[ 0 ][ 2 ] ) * q.m[ 1 ][ 2 ];
		res.m[ res.a02 ] =
			q.m[ 2 ][ 0 ] * ( a.m[ 0 ] * q.m[ 0 ][ 0 ]) +
			q.m[ 2 ][ 1 ] * ( a.m[ 1 ] * q.m[ 0 ][ 1 ] ) + ( a.m[ 2 ] * q.m[ 0 ][ 2 ] ) * q.m[ 2 ][ 2 ];
		res.m[ res.a11 ] =
			q.m[ 1 ][ 0 ] * ( a.m[ 0 ] * q.m[ 1 ][ 0 ]) +
			q.m[ 1 ][ 1 ] * ( a.m[ 1 ] * q.m[ 1 ][ 1 ]) +
			q.m[ 1 ][ 2 ] * ( a.m[ 2 ] * q.m[ 1 ][ 2 ] );
		res.m[ res.a12 ] =
			q.m[ 2 ][ 0 ] * ( a.m[ 0 ] * q.m[ 1 ][ 0 ]) +
			q.m[ 2 ][ 1 ] * (  a.m[ 1 ] * q.m[ 1 ][ 1 ] ) + ( a.m[ 2 ] * q.m[ 1 ][ 2 ] ) * q.m[ 2 ][ 2 ];
		res.m[ res.a22 ] =
			q.m[ 2 ][ 0 ] * ( a.m[ 0 ] * q.m[ 2 ][ 0 ] ) +
			q.m[ 2 ][ 1 ] * ( a.m[ 1 ] * q.m[ 2 ][ 1 ] ) +
			q.m[ 2 ][ 2 ] * ( a.m[ 2 ] * q.m[ 2 ][ 2 ] );
		return res;
	}

	template <typename Float >
	void PerformGivensRotation2x2( const SinCos<Float> &res, Float &a11, Float &a12, Float &a22 )
	{
		const Float two = Replicate<Float>( 2.0f );
		Float cc = res.c * res.c, ss = res.s * res.s, cs = res.c * res.s;
		Float b11 = cc * a11 + two * cs * a12 + ss * a22;
		Float b12 = cs * ( a22 - a11 ) + ( cc - ss ) * a12;
		Float b22 = ss * a11 - two * cs * a12 + cc * a22;

		a11 = b11;
		a12 = b12;
		a22 = b22;
	}


	template <typename Float>
	void UnperformGivensRotation3x3( const SinCos<Float> &r, Float &a00, Float &a01, Float &a11, Float &a02, Float &a12 )
	{
		const Float two = Replicate<Float>( 2.0f );

		Float b00 = a00 * r.c * r.c - r.s * ( two * a01 * r.c - a11 * r.s );
		Float b01 = r.c*( a01 * r.c + a00 * r.s ) - r.s * ( a11 * r.c + a01 * r.s );
		Float b11 = a11 * r.c * r.c + r.s * ( two * a01 * r.c + a00 * r.s );
		Float b02 = a02 *r.c - a12 * r.s;
		Float b12 = a12 *r.c + a02 * r.s;

		a00 = b00;
		a01 = b01;
		a11 = b11;
		a02 = b02;
		a12 = b12;
	}

	template <typename Float>
	void PerformGivensRotation3x3( const SinCos<Float> &r, Float &a00, Float &a01, Float &a11, Float &a02, Float &a12 )
	{
		const Float two = Replicate<Float>( 2.0f );

		Float b00 = a00 * r.c * r.c + r.s * ( two * a01 * r.c + a11 * r.s );
		Float b01 = r.c*( a01 *r.c - a00 * r.s ) + r.s * ( a11 *r.c - a01 * r.s );
		Float b11 = a11 * r.c *r.c - r.s * ( two * a01 * r.c - a00 * r.s );
		Float b02 = a02 *r.c + a12 * r.s;
		Float b12 = a12 *r.c - a02 * r.s;

		a00 = b00;
		a01 = b01;
		a11 = b11;
		a02 = b02;
		a12 = b12;
	}

	inline SinCos< float > ComputeGivensRotation( float a11, float a12, float a22 )
	{
		float theta = fabsf( a11 - a22 ) > 1e-6f ? 0.5f * atanf( 2 * a12 / ( a11 - a22 ) ) : 3.14159265358979323846f / 4;
		SinCos< float >res( cosf( theta ), sinf( theta ) );
#ifdef _DEBUG
		PerformGivensRotation2x2( res, a11, a12, a22 );
		Assert( fabsf( a12 ) < 0.001f * ( 1 + fabsf( a11 ) + fabsf( a22 ) ) );
#endif
		return res;
	}

	template <typename Float >
	inline SinCos< Float> ApproximateGivensRotation( const Float & a11, const Float & a12, const Float & a22 )
	{
		const Float two = Replicate<Float>( 2.0f );
		Float ch = two * ( a11 - a22 );
		Float sh = a12;
		typename FloatTraits<Float>::Bool b = CmpLt( Replicate<Float>( 5.82842712474619f ) * sh*sh, ch*ch );
		Float r2 = ch*ch + sh *sh;
		typename FloatTraits<Float>::Bool bZero = CmpLt( r2, Replicate<Float>( 1e-12f ) );
		Float omega = RsqrtEst( r2 );
		SinCos<Float>res;
		res.s = Select( Replicate<Float>( 0.3826834323650897717284599840304f ), omega * sh, b );
		res.c = Select( Replicate<Float>( 0.92387953251128675612818318939679f ), omega * ch, b );

		res.s = Select( res.s, Replicate<Float>( 0.0f ), bZero ); // todo: replace with And
		res.c = Select( res.c, Replicate<Float>( 1.0f ), bZero );

		return res;
	}



	template <typename Float >
	void PerformGivensRotationQuaternion( const SinCos<Float> &res, Float &x, Float &y, Float &z, Float &w )
	{
		//const Float two = Replicate<Float>( 2.0f );
		Float xNew = res.c * x + res.s * w, yNew = res.c * y + res.s * z, zNew = res.c * z - res.s * y, wNew = res.c * w - res.s * x;
		x = xNew;
		y = yNew;
		z = zNew;
		w = wNew;
	}

	template <typename Float >
	class SvdIterator
	{
	public:
		SvdIterator(){}
		
		Quaternion < Float > q;
		SymMatrix3<Float> ata;

		void Init( const Matrix3<Float> a )
		{
			q.SetIdentity();
			ata = AtA( a );
		}


		void Iterate( int nIterations, float flEpsilon = 0.0f )
		{
			SinCos< Float> r;

			//SymMatrix3<Float> inv0 = QAQt( QuaternionMatrix( q ), ata ), origAta = AtA( a );

			for ( int i = 0; i < nIterations; ++i )
			{
				r = ApproximateGivensRotation( ata.m[ ata.a00 ], ata.m[ ata.a10 ], ata.m[ ata.a11 ] );
				Float sumErrors = r.s * r.s;
				PerformGivensRotation3x3( r.DoubleAngle(), ata.m[ ata.a00 ], ata.m[ ata.a10 ], ata.m[ ata.a11 ], ata.m[ ata.a20 ], ata.m[ ata.a21 ] );
				PerformGivensRotationQuaternion( r, q.z, q.x, q.y, q.w );
				//SymMatrix3<Float> inv1 = QAQt( QuaternionMatrix( q ), ata );

				r = ApproximateGivensRotation( ata.m[ ata.a11 ], ata.m[ ata.a21 ], ata.m[ ata.a22 ] );
				sumErrors += r.s * r.s;
				PerformGivensRotation3x3( r.DoubleAngle(), ata.m[ ata.a11 ], ata.m[ ata.a21 ], ata.m[ ata.a22 ], ata.m[ ata.a01 ], ata.m[ ata.a02 ] );
				PerformGivensRotationQuaternion( r, q.x, q.y, q.z, q.w );
				//SymMatrix3<Float> inv2 = QAQt( QuaternionMatrix( q ), ata );

				r = ApproximateGivensRotation( ata.m[ ata.a22 ], ata.m[ ata.a20 ], ata.m[ ata.a00 ] );
				sumErrors += r.s * r.s;
				PerformGivensRotation3x3( r.DoubleAngle(), ata.m[ ata.a22 ], ata.m[ ata.a02 ], ata.m[ ata.a00 ], ata.m[ ata.a12 ], ata.m[ ata.a10 ] );
				PerformGivensRotationQuaternion( r, q.y, q.z, q.x, q.w );
				//SymMatrix3<Float> inv3 = QAQt( QuaternionMatrix( q ), ata );
				if ( AllLe( sumErrors, Replicate<Float>( flEpsilon ) ) )
					break; // early out
			}
		}

		Matrix3< Float > ComputeV()const { return QuaternionMatrix( q * Rsqrt( q.LengthSqr() ) ); }
	};

	inline float PseudoInverse( float fl ) { return fabsf( fl ) < FLT_EPSILON ? 0 : 1.0f / fl; }

	inline SymMatrix3< float > PseudoInverse( const SymMatrix3< float > &cov )
	{
		SvdIterator< float > si;
		si.q.SetIdentity();
		si.ata = cov;
		si.Iterate( 5 );
		DiagMatrix3< float > pseudoInverseDiag;
		pseudoInverseDiag.m[ 0 ] = PseudoInverse( si.ata.m00() );
		pseudoInverseDiag.m[ 1 ] = PseudoInverse( si.ata.m11() );
		pseudoInverseDiag.m[ 2 ] = PseudoInverse( si.ata.m22() );
		return QAQt( si.ComputeV(), pseudoInverseDiag );
	}
}