//========= Copyright Valve Corporation, All rights reserved. ============// // // Purpose: - defines SIMD "structure of arrays" classes and functions. // //===========================================================================// #ifndef SSEQUATMATH_H #define SSEQUATMATH_H #ifdef _WIN32 #pragma once #endif #include "mathlib/ssemath.h" // Use this #define to allow SSE versions of Quaternion math // to exist on PC. // On PC, certain horizontal vector operations are not supported. // This causes the SSE implementation of quaternion math to mix the // vector and scalar floating point units, which is extremely // performance negative if you don't compile to native SSE2 (which // we don't as of Sept 1, 2007). So, it's best not to allow these // functions to exist at all. It's not good enough to simply replace // the contents of the functions with scalar math, because each call // to LoadAligned and StoreAligned will result in an unnecssary copy // of the quaternion, and several moves to and from the XMM registers. // // Basically, the problem you run into is that for efficient SIMD code, // you need to load the quaternions and vectors into SIMD registers and // keep them there as long as possible while doing only SIMD math, // whereas for efficient scalar code, each time you copy onto or ever // use a fltx4, it hoses your pipeline. So the difference has to be // in the management of temporary variables in the calling function, // not inside the math functions. // // If you compile assuming the presence of SSE2, the MSVC will abandon // the traditional x87 FPU operations altogether and make everything use // the SSE2 registers, which lessens this problem a little. // permitted only on 360, as we've done careful tuning on its Altivec math: #ifdef _X360 #define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC! #endif //--------------------------------------------------------------------- // Load/store quaternions //--------------------------------------------------------------------- #ifndef _X360 #if ALLOW_SIMD_QUATERNION_MATH // Using STDC or SSE FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) { fltx4 retval = LoadAlignedSIMD( pSIMD.Base() ); return retval; } FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) { fltx4 retval = LoadAlignedSIMD( pSIMD ); return retval; } FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a ) { StoreAlignedSIMD( pSIMD->Base(), a ); } #endif #else // for the transitional class -- load a QuaternionAligned FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) { fltx4 retval = XMLoadVector4A( pSIMD.Base() ); return retval; } FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) { fltx4 retval = XMLoadVector4A( pSIMD ); return retval; } FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a ) { XMStoreVector4A( pSIMD->Base(), a ); } #endif #if ALLOW_SIMD_QUATERNION_MATH //--------------------------------------------------------------------- // Make sure quaternions are within 180 degrees of one another, if not, reverse q //--------------------------------------------------------------------- FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q ) { // decide if one of the quaternions is backwards fltx4 a = SubSIMD( p, q ); fltx4 b = AddSIMD( p, q ); a = Dot4SIMD( a, a ); b = Dot4SIMD( b, b ); fltx4 cmp = CmpGtSIMD( a, b ); fltx4 result = MaskedAssign( cmp, NegSIMD(q), q ); return result; } //--------------------------------------------------------------------- // Normalize Quaternion //--------------------------------------------------------------------- #if USE_STDC_FOR_SIMD FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q ) { fltx4 radius, result; radius = Dot4SIMD( q, q ); if ( SubFloat( radius, 0 ) ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON)) { float iradius = 1.0f / sqrt( SubFloat( radius, 0 ) ); result = ReplicateX4( iradius ); result = MulSIMD( result, q ); return result; } return q; } #else // SSE + X360 implementation FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q ) { fltx4 radius, result, mask; radius = Dot4SIMD( q, q ); mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0 result = ReciprocalSqrtSIMD( radius ); result = MulSIMD( result, q ); return MaskedAssign( mask, q, result ); // if radius was 0, just return q } #endif //--------------------------------------------------------------------- // 0.0 returns p, 1.0 return q. //--------------------------------------------------------------------- FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) { fltx4 sclp, sclq, result; sclq = ReplicateX4( t ); sclp = SubSIMD( Four_Ones, sclq ); result = MulSIMD( sclp, p ); result = MaddSIMD( sclq, q, result ); return QuaternionNormalizeSIMD( result ); } //--------------------------------------------------------------------- // Blend Quaternions //--------------------------------------------------------------------- FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t ) { // decide if one of the quaternions is backwards fltx4 q2, result; q2 = QuaternionAlignSIMD( p, q ); result = QuaternionBlendNoAlignSIMD( p, q2, t ); return result; } //--------------------------------------------------------------------- // Multiply Quaternions //--------------------------------------------------------------------- #ifndef _X360 // SSE and STDC FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) { // decide if one of the quaternions is backwards fltx4 q2, result; q2 = QuaternionAlignSIMD( p, q ); SubFloat( result, 0 ) = SubFloat( p, 0 ) * SubFloat( q2, 3 ) + SubFloat( p, 1 ) * SubFloat( q2, 2 ) - SubFloat( p, 2 ) * SubFloat( q2, 1 ) + SubFloat( p, 3 ) * SubFloat( q2, 0 ); SubFloat( result, 1 ) = -SubFloat( p, 0 ) * SubFloat( q2, 2 ) + SubFloat( p, 1 ) * SubFloat( q2, 3 ) + SubFloat( p, 2 ) * SubFloat( q2, 0 ) + SubFloat( p, 3 ) * SubFloat( q2, 1 ); SubFloat( result, 2 ) = SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 ); SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 ); return result; } #else // X360 extern const fltx4 g_QuatMultRowSign[4]; FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q ) { fltx4 q2, row, result; q2 = QuaternionAlignSIMD( p, q ); row = XMVectorSwizzle( q2, 3, 2, 1, 0 ); row = MulSIMD( row, g_QuatMultRowSign[0] ); result = Dot4SIMD( row, p ); row = XMVectorSwizzle( q2, 2, 3, 0, 1 ); row = MulSIMD( row, g_QuatMultRowSign[1] ); row = Dot4SIMD( row, p ); result = __vrlimi( result, row, 4, 0 ); row = XMVectorSwizzle( q2, 1, 0, 3, 2 ); row = MulSIMD( row, g_QuatMultRowSign[2] ); row = Dot4SIMD( row, p ); result = __vrlimi( result, row, 2, 0 ); row = MulSIMD( q2, g_QuatMultRowSign[3] ); row = Dot4SIMD( row, p ); result = __vrlimi( result, row, 1, 0 ); return result; } #endif //--------------------------------------------------------------------- // Quaternion scale //--------------------------------------------------------------------- #ifndef _X360 // SSE and STDC FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) { float r; fltx4 q; // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale. float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) ); sinom = min( sinom, 1.f ); float sinsom = sin( asin( sinom ) * t ); t = sinsom / (sinom + FLT_EPSILON); SubFloat( q, 0 ) = t * SubFloat( p, 0 ); SubFloat( q, 1 ) = t * SubFloat( p, 1 ); SubFloat( q, 2 ) = t * SubFloat( p, 2 ); // rescale rotation r = 1.0f - sinsom * sinsom; // Assert( r >= 0 ); if (r < 0.0f) r = 0.0f; r = sqrt( r ); // keep sign of rotation SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r ); return q; } #else // X360 FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t ) { fltx4 sinom = Dot3SIMD( p, p ); sinom = SqrtSIMD( sinom ); sinom = MinSIMD( sinom, Four_Ones ); fltx4 sinsom = ArcSinSIMD( sinom ); fltx4 t4 = ReplicateX4( t ); sinsom = MulSIMD( sinsom, t4 ); sinsom = SinSIMD( sinsom ); sinom = AddSIMD( sinom, Four_Epsilons ); sinom = ReciprocalSIMD( sinom ); t4 = MulSIMD( sinsom, sinom ); fltx4 result = MulSIMD( p, t4 ); // rescale rotation sinsom = MulSIMD( sinsom, sinsom ); fltx4 r = SubSIMD( Four_Ones, sinsom ); r = MaxSIMD( r, Four_Zeros ); r = SqrtSIMD( r ); // keep sign of rotation fltx4 cmp = CmpGeSIMD( p, Four_Zeros ); r = MaskedAssign( cmp, r, NegSIMD( r ) ); result = __vrlimi(result, r, 1, 0); return result; } #endif //----------------------------------------------------------------------------- // Quaternion sphereical linear interpolation //----------------------------------------------------------------------------- #ifndef _X360 // SSE and STDC FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) { float omega, cosom, sinom, sclp, sclq; fltx4 result; // 0.0 returns p, 1.0 return q. cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) + SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 ); if ( (1.0f + cosom ) > 0.000001f ) { if ( (1.0f - cosom ) > 0.000001f ) { omega = acos( cosom ); sinom = sin( omega ); sclp = sin( (1.0f - t)*omega) / sinom; sclq = sin( t*omega ) / sinom; } else { // TODO: add short circuit for cosom == 1.0f? sclp = 1.0f - t; sclq = t; } SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( q, 0 ); SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( q, 1 ); SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( q, 2 ); SubFloat( result, 3 ) = sclp * SubFloat( p, 3 ) + sclq * SubFloat( q, 3 ); } else { SubFloat( result, 0 ) = -SubFloat( q, 1 ); SubFloat( result, 1 ) = SubFloat( q, 0 ); SubFloat( result, 2 ) = -SubFloat( q, 3 ); SubFloat( result, 3 ) = SubFloat( q, 2 ); sclp = sin( (1.0f - t) * (0.5f * M_PI)); sclq = sin( t * (0.5f * M_PI)); SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( result, 0 ); SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( result, 1 ); SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( result, 2 ); } return result; } #else // X360 FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t ) { return XMQuaternionSlerp( p, q, t ); } #endif FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t ) { fltx4 q2, result; q2 = QuaternionAlignSIMD( p, q ); result = QuaternionSlerpNoAlignSIMD( p, q2, t ); return result; } #endif // ALLOW_SIMD_QUATERNION_MATH #endif // SSEQUATMATH_H