SSE.h

Go to the documentation of this file.
00001 //**************************************************************************/
00002 // Copyright (c) 2008 Autodesk, Inc.
00003 // All rights reserved.
00004 //
00005 // Use of this software is subject to the terms of the Autodesk license
00006 // agreement provided at the time of installation or download, or which
00007 // otherwise accompanies this software in either electronic or hard copy form.
00008 //
00009 //**************************************************************************/
00010 // DESCRIPTION:
00011 // CREATED: October 2008
00012 //**************************************************************************/
00013 
00014 #ifndef __MUDBOXSDK_SSE_H__
00015 #define __MUDBOXSDK_SSE_H__
00016 
00017 #if defined(JAMBUILD)
00018 #include <Mudbox/mudbox.h>
00019 #else
00020 #include "mudbox.h"
00021 #endif
00022 
00023 #if defined(__GNUC__)
00024 
00025 #ifndef __SSE3__
00026 #error This file was intended to compiled with SSE3 instruction set enabled.
00027 #endif
00028 
00029 //
00030 // If you are using GCC instead of the Intel C Compiler, don't forget 
00031 // to specify -I/usr/lib/gcc/i686-apple-darwin9/4.0.1/include when compiling
00032 // a file that uses this header.
00033 //
00034 #include <xmmintrin.h>
00035 #include <pmmintrin.h>
00036 #endif
00037 
00038 #if defined(WIN32) || defined(WIN64)
00039 #include <xmmintrin.h>
00040 #include <intrin.h>
00041 #endif
00042 
#ifndef MB_SSE_ALIGN16_VAR

// 16-byte alignment helpers.
//   MB_SSE_ALIGN16_VAR(decl)  wraps a variable declaration so the variable is
//                             16-byte aligned, e.g. MB_SSE_ALIGN16_VAR(float c[4]);
//   MB_SSE_ALIGN16_CLASS      the bare alignment specifier for the active compiler.
// Both are skipped when MB_SSE_ALIGN16_VAR has already been defined elsewhere.
#ifdef _MSC_VER
    // MSVC spells the specifier in front of the declarator.
    #define MB_SSE_ALIGN16_VAR(v) __declspec(align(16)) v
    #define MB_SSE_ALIGN16_CLASS  __declspec(align(16))
#elif defined(__GNUC__)
    // GCC-compatible compilers take the attribute after the declarator.
    #define MB_SSE_ALIGN16_VAR(v) v __attribute__ ((aligned(16)))
    #define MB_SSE_ALIGN16_CLASS  __attribute__ ((aligned(16)))
#else
    #error The MB_SSE_ALIGN16_VAR needs to be ported for this compiler.
#endif

#endif // MB_SSE_ALIGN16_VAR
00056 
00057 
00058 struct MBDLL_DECL HWVector
00059 {
00060     inline HWVector( void ) {};
00061 
00062     inline void setZero( void ) 
00063     {
00064         v = _mm_setzero_ps();
00065     }
00066 
00067     inline HWVector( float f )
00068     {
00069         v = _mm_set_ps1( f );
00070     }
00071 
00072     inline HWVector( const mudbox::Vector &h ) 
00073     { 
00074         v = _mm_set_ps( h.x, h.y, h.z, 0); 
00075     };
00076     inline HWVector( float x, float y, float z, float w = 0 ) 
00077     { 
00078         v = _mm_set_ps( x,y,z,w ); 
00079     };
00080     void Fill( const mudbox::Vector &vVector );
00081     inline HWVector( const float a[4] )
00082     {
00083         v = _mm_loadu_ps( a );
00084     };
00085     inline HWVector ShiftLeft( void ) const    { HWVector r; r.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); return r; };
00086     inline void ShiftLeft(HWVector &result)    { result.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); }
00087     inline void ShiftLeftInPlace()             { v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); }
00088     inline HWVector ShiftRight( void ) const   { HWVector r; r.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,3,2,0)); return r; };
00089     inline void ShiftRight( HWVector &result ) { result.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,3,2,0)); };
00090     inline HWVector operator &(  HWVector &o )
00091     {
00092         //HWVector al = (*this).ShiftLeft(), bl = o.ShiftLeft(), ar = (*this).ShiftRight(), br = o.ShiftRight();
00093         HWVector al, bl, ar, br;
00094         ShiftLeft(al);
00095         o.ShiftLeft(bl);
00096         ShiftRight(ar);
00097         o.ShiftRight(br);
00098 
00099         return al*br-ar*bl;
00100     };
00101 
00102     inline HWVector operator |( const HWVector &o ) const
00103     {
00104         HWVector r;
00105         r.v = _mm_mul_ps( v, o.v );
00106         // Requires SSE3
00107         r.v = _mm_hadd_ps( r.v, r.v );
00108         r.v = _mm_hadd_ps( r.v, r.v );
00109         return r;
00110     };
00111 
00112     inline HWVector Length( void ) const
00113     {
00114         HWVector r = operator |( *this );
00115         r.v = _mm_sqrt_ss( r.v );
00116         return r;
00117     };
00118 
00119     inline HWVector LengthSquared( void ) const
00120     {
00121         HWVector r = operator |( *this );
00122         return r;
00123     };
00124 
00125     inline float DistanceFromLine( const HWVector &vStart,const HWVector &vEnd ) const
00126     {
00127         HWVector n = vEnd - vStart;
00128         HWVector m = (*this)-vStart;
00129         HWVector h = n&m;
00130         HWVector d = n&h;
00131         d.Normalize();
00132         float fDistance = (d|(*this))-(d|vStart);
00133         return fDistance>0?fDistance:(-fDistance);
00134     }
00135 
00136     inline void Normalize( void )
00137     {
00138         HWVector f = operator |( *this );
00139         f.v = _mm_rsqrt_ps( f.v );
00140         v = _mm_mul_ps( f.v, v );
00141     };
00142 
00143     inline HWVector Floor( void )
00144     {
00145         static unsigned int a = 1<<23;
00146         static float twoTo23AsFloat = (float)a;
00147         static const __m128 twoTo23 = _mm_set_ps( twoTo23AsFloat,twoTo23AsFloat,twoTo23AsFloat,twoTo23AsFloat );
00148         // b = fabs(v)
00149         __m128 b = _mm_castsi128_ps(_mm_srli_epi32( _mm_slli_epi32( _mm_castps_si128(v),1 ), 1 ));
00150         // The essence of the floor routine
00151         __m128 d = _mm_sub_ps( _mm_add_ps( _mm_add_ps( _mm_sub_ps( v,twoTo23 ), twoTo23 ),twoTo23 ), twoTo23 );
00152         // �1 if v >= 2**23
00153         __m128 largeMaskE = _mm_cmpgt_ps( b, twoTo23 );
00154         // Check for possible off by one error
00155         __m128 g = _mm_cmplt_ps( v, d );
00156         // Convert positive check result to -1.0, negative to 0.0
00157         __m128 h = _mm_cvtepi32_ps( _mm_castps_si128(g) );
00158         // Add in the error if there is one
00159         __m128 t = _mm_add_ps( d, h );
00160         //Select between output result and input value based on v >= 2**23
00161         __m128 w = _mm_and_ps( v, largeMaskE );
00162         t = _mm_andnot_ps( largeMaskE, t );
00163         HWVector vResult;
00164         vResult.v = _mm_or_ps( t, w );
00165         return vResult;
00166     };
00167 
00168     inline HWVector Minimum( const HWVector &o ) const    { HWVector r; r.v = _mm_min_ps( v, o.v ); return r; };
00169     inline HWVector Maximum( const HWVector &o ) const    { HWVector r; r.v = _mm_max_ps( v, o.v ); return r; };
00170     inline HWVector operator +( const HWVector &o ) const { HWVector r; r.v = _mm_add_ps( v, o.v ); return r; };
00171     inline HWVector operator -( const HWVector &o ) const { HWVector r; r.v = _mm_sub_ps( v, o.v ); return r; };
00172     inline void operator +=( const HWVector &o ) { v = _mm_add_ps( v, o.v ); };
00173     inline void operator -=( const HWVector &o ) { v = _mm_sub_ps( v, o.v ); };
00174     inline void operator *=( const HWVector &o ) { v = _mm_mul_ps( v, o.v ); };
00175     inline void operator /=( const HWVector &o ) { v = _mm_div_ps( v, o.v ); };
00176     inline HWVector operator *( const HWVector &o ) const { HWVector r; r.v = _mm_mul_ps( v, o.v ); return r; };
00177     inline HWVector operator *( float f ) const { HWVector r; r.v = _mm_mul_ps( v, _mm_set1_ps( f ) ); return r; };
00178     inline HWVector operator /( const HWVector &o ) const { HWVector r; r.v = _mm_div_ps( v, o.v ); return r; };
00179     inline void operator *=( float f ) { v = _mm_mul_ps( v, _mm_set1_ps(f ) ); };
00180     inline void Store( float *p ) { _mm_storeu_ps( p, v ); };
00181     inline void Load( float f ) { v = _mm_set_ps1( f ); };
00182     inline void StoreNormalAsInt( int *pBuffer ) const
00183     {
00184         //static __declspec(align(16)) float c[4] = { 32766.0f, 32766.0f, 32766.0f, 32766.0f };
00185         // instead of the correct value, we use a little bit smaller number, because after 
00186         // normalization a component can be a littlebit bigger than 1.0. in that case storing it in
00187         // a 16 bit integer would overflow, and artifacts on the surface would appear.
00188         static const MB_SSE_ALIGN16_VAR(float c[4]) = { 32740.0f, 32740.0f, 32740.0f, 32740.0f };
00189 
00190         __m128 f = _mm_load_ps( c );
00191         f = _mm_mul_ps( f, v );
00192         __m128i i = _mm_cvtps_epi32( f );
00193         _mm_storeu_si128( (__m128i *)pBuffer, i );
00194     };
00195     inline void StoreAsInt( int *pBuffer ) const
00196     {
00197         __m128i i = _mm_cvtps_epi32( v );
00198         _mm_storeu_si128( (__m128i *)pBuffer, i );
00199     };
00200     inline operator float( void ) const
00201     {
00202         MB_SSE_ALIGN16_VAR(float f);
00203         _mm_store_ss( &f, v );
00204         return f;
00205     };
00206     inline operator mudbox::Vector( void ) const
00207     {
00208         mudbox::Vector r;
00209         __m128 t = _mm_shuffle_ps( v, v, _MM_SHUFFLE(2,1,0,3) );
00210         _mm_store_ss( &r.x, t );
00211         t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00212         _mm_store_ss( &r.y, t );
00213         t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00214         _mm_store_ss( &r.z, t );
00215         return r;
00216     };
00217     inline operator mudbox::Vector4( void ) const
00218     {
00219         mudbox::Vector4 r;
00220         __m128 t = _mm_shuffle_ps( v, v, _MM_SHUFFLE(2,1,0,3) );
00221         _mm_store_ss( &r.x, t );
00222         t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00223         _mm_store_ss( &r.y, t );
00224         t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00225         _mm_store_ss( &r.z, t );
00226         t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00227         _mm_store_ss( &r.w, t );
00228         return r;
00229     };
00230 
00231     MB_SSE_ALIGN16_VAR(__m128 v);
00232 };
00233 
00234 inline mudbox::Vector &operator <<( mudbox::Vector &v, const HWVector &r )
00235 {
00236     __m128 t = _mm_shuffle_ps( r.v, r.v, _MM_SHUFFLE(2,1,0,3) );
00237     _mm_store_ss( &v.x, t );
00238     t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00239     _mm_store_ss( &v.y, t );
00240     t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00241     _mm_store_ss( &v.z, t );
00242     return v;
00243 
00245     //__m128 t = _mm_shuffle_ps( r.v, r.v, _MM_SHUFFLE(0,1,2,3) );
00246     //int a = ((int *)&v)[3];
00247     //_mm_storeu_ps( &v.x, t );
00248     //((int *)&v)[3] = a;
00249     //return v;
00250 };
00251 
00253 
00254 struct HWMatrix
00255 {
00256     inline HWMatrix( void ) {};
00257     inline HWMatrix( const mudbox::Matrix &m )
00258     {
00259         r0 = _mm_loadu_ps( &m._11 );
00260         r1 = _mm_loadu_ps( &m._21 );
00261         r2 = _mm_loadu_ps( &m._31 );
00262         r3 = _mm_loadu_ps( &m._41 );
00263     };
00264     void MirrorX( void )
00265     {
00266         r0 = _mm_shuffle_ps( r0, r0, _MM_SHUFFLE(0, 1, 2, 3) );
00267         r1 = _mm_shuffle_ps( r1, r1, _MM_SHUFFLE(0, 1, 2, 3) );
00268         r2 = _mm_shuffle_ps( r2, r2, _MM_SHUFFLE(0, 1, 2, 3) );
00269         r3 = _mm_shuffle_ps( r3, r3, _MM_SHUFFLE(0, 1, 2, 3) );
00270     };
00271     inline HWVector Transform( const HWVector &v ) const
00272     {
00273         static MB_SSE_ALIGN16_VAR(float c[4]) = { 1, 1, 1, 1 };
00274 
00275         __m128 v0 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(3,3,3,3) );
00276         __m128 v1 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(2,2,2,2) );
00277         __m128 v2 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(1,1,1,1) );
00278         __m128 v3 = _mm_load_ps( c );
00279 
00280         __m128 a0 = _mm_mul_ps( v0, r0 );
00281         __m128 a1 = _mm_mul_ps( v1, r1 );
00282         __m128 a2 = _mm_mul_ps( v2, r2 );
00283         __m128 a3 = _mm_mul_ps( v3, r3 );
00284 
00285         __m128 r = _mm_add_ps( a0, _mm_add_ps( a1, _mm_add_ps( a2, a3 ) ) );
00286 
00287         HWVector z;
00288         z.v = r;
00289         return z;
00290     };
00291     inline HWVector ProjectedTransform( const HWVector &v ) const
00292     {
00293         static MB_SSE_ALIGN16_VAR(float c[4]) = { 1, 1, 1, 1 };
00294 
00295         __m128 v0 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(3,3,3,3) );
00296         __m128 v1 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(2,2,2,2) );
00297         __m128 v2 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(1,1,1,1) );
00298         __m128 v3 = _mm_load_ps( c );
00299 
00300         __m128 a0 = _mm_mul_ps( v0, r0 );
00301         __m128 a1 = _mm_mul_ps( v1, r1 );
00302         __m128 a2 = _mm_mul_ps( v2, r2 );
00303         __m128 a3 = _mm_mul_ps( v3, r3 );
00304 
00305         __m128 r = _mm_add_ps( a0, _mm_add_ps( a1, _mm_add_ps( a2, a3 ) ) );
00306         __m128 d = _mm_shuffle_ps( r, r, _MM_SHUFFLE(3,3,3,3) );
00307 
00308         HWVector z;
00309         z.v = _mm_div_ps( r, d );
00310         return z;
00311     };
00312     
00313     MB_SSE_ALIGN16_VAR(__m128 r0);
00314     MB_SSE_ALIGN16_VAR(__m128 r1);
00315     MB_SSE_ALIGN16_VAR(__m128 r2);
00316     MB_SSE_ALIGN16_VAR(__m128 r3);
00317 };
00318 
00319 
00320 //-----------------------------------------------------------------------------
00322 bool MBDLL_DECL hasSSE3();
00323 
00325 bool MBDLL_DECL hasSSE41();
00326 
00328 bool MBDLL_DECL hasSSE42();
00329 
00331 bool MBDLL_DECL hasAVX256();
00332 
00333 //-----------------------------------------------------------------------------
00334 
00335 #endif