00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #ifndef __MUDBOXSDK_SSE_H__
00015 #define __MUDBOXSDK_SSE_H__
00016
00017 #if defined(JAMBUILD)
00018 #include <Mudbox/mudbox.h>
00019 #else
00020 #include "mudbox.h"
00021 #endif
00022
00023 #if defined(__GNUC__)
00024
00025 #ifndef __SSE3__
00026 #error This file was intended to compiled with SSE3 instruction set enabled.
00027 #endif
00028
00029
00030
00031
00032
00033
00034 #include <xmmintrin.h>
00035 #include <pmmintrin.h>
00036 #endif
00037
00038 #if defined(WIN32) || defined(WIN64)
00039 #include <xmmintrin.h>
00040 #include <intrin.h>
00041 #endif
00042
00043 #ifndef MB_SSE_ALIGN16_VAR
00044
00045 #if defined(_MSC_VER)
00046 #define MB_SSE_ALIGN16_VAR(v) __declspec(align(16)) v
00047 #define MB_SSE_ALIGN16_CLASS __declspec(align(16))
00048 #elif defined(__GNUC__)
00049 #define MB_SSE_ALIGN16_VAR(v) v __attribute__ ((aligned(16)))
00050 #define MB_SSE_ALIGN16_CLASS __attribute__ ((aligned(16)))
00051 #else
00052 #error The MB_SSE_ALIGN16_VAR needs to be ported for this compiler.
00053 #endif
00054
00055 #endif
00056
00057
00058 struct MBDLL_DECL HWVector
00059 {
00060 inline HWVector( void ) {};
00061
00062 inline void setZero( void )
00063 {
00064 v = _mm_setzero_ps();
00065 }
00066
00067 inline HWVector( float f )
00068 {
00069 v = _mm_set_ps1( f );
00070 }
00071
00072 inline HWVector( const mudbox::Vector &h )
00073 {
00074 v = _mm_set_ps( h.x, h.y, h.z, 0);
00075 };
00076 inline HWVector( float x, float y, float z, float w = 0 )
00077 {
00078 v = _mm_set_ps( x,y,z,w );
00079 };
00080 void Fill( const mudbox::Vector &vVector );
00081 inline HWVector( const float a[4] )
00082 {
00083 v = _mm_loadu_ps( a );
00084 };
00085 inline HWVector ShiftLeft( void ) const { HWVector r; r.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); return r; };
00086 inline void ShiftLeft(HWVector &result) { result.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); }
00087 inline void ShiftLeftInPlace() { v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,1,3,0)); }
00088 inline HWVector ShiftRight( void ) const { HWVector r; r.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,3,2,0)); return r; };
00089 inline void ShiftRight( HWVector &result ) { result.v = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,3,2,0)); };
00090 inline HWVector operator &( HWVector &o )
00091 {
00092
00093 HWVector al, bl, ar, br;
00094 ShiftLeft(al);
00095 o.ShiftLeft(bl);
00096 ShiftRight(ar);
00097 o.ShiftRight(br);
00098
00099 return al*br-ar*bl;
00100 };
00101
00102 inline HWVector operator |( const HWVector &o ) const
00103 {
00104 HWVector r;
00105 r.v = _mm_mul_ps( v, o.v );
00106
00107 r.v = _mm_hadd_ps( r.v, r.v );
00108 r.v = _mm_hadd_ps( r.v, r.v );
00109 return r;
00110 };
00111
00112 inline HWVector Length( void ) const
00113 {
00114 HWVector r = operator |( *this );
00115 r.v = _mm_sqrt_ss( r.v );
00116 return r;
00117 };
00118
00119 inline HWVector LengthSquared( void ) const
00120 {
00121 HWVector r = operator |( *this );
00122 return r;
00123 };
00124
00125 inline float DistanceFromLine( const HWVector &vStart,const HWVector &vEnd ) const
00126 {
00127 HWVector n = vEnd - vStart;
00128 HWVector m = (*this)-vStart;
00129 HWVector h = n&m;
00130 HWVector d = n&h;
00131 d.Normalize();
00132 float fDistance = (d|(*this))-(d|vStart);
00133 return fDistance>0?fDistance:(-fDistance);
00134 }
00135
00136 inline void Normalize( void )
00137 {
00138 HWVector f = operator |( *this );
00139 f.v = _mm_rsqrt_ps( f.v );
00140 v = _mm_mul_ps( f.v, v );
00141 };
00142
00143 inline HWVector Floor( void )
00144 {
00145 static unsigned int a = 1<<23;
00146 static float twoTo23AsFloat = (float)a;
00147 static const __m128 twoTo23 = _mm_set_ps( twoTo23AsFloat,twoTo23AsFloat,twoTo23AsFloat,twoTo23AsFloat );
00148
00149 __m128 b = _mm_castsi128_ps(_mm_srli_epi32( _mm_slli_epi32( _mm_castps_si128(v),1 ), 1 ));
00150
00151 __m128 d = _mm_sub_ps( _mm_add_ps( _mm_add_ps( _mm_sub_ps( v,twoTo23 ), twoTo23 ),twoTo23 ), twoTo23 );
00152
00153 __m128 largeMaskE = _mm_cmpgt_ps( b, twoTo23 );
00154
00155 __m128 g = _mm_cmplt_ps( v, d );
00156
00157 __m128 h = _mm_cvtepi32_ps( _mm_castps_si128(g) );
00158
00159 __m128 t = _mm_add_ps( d, h );
00160
00161 __m128 w = _mm_and_ps( v, largeMaskE );
00162 t = _mm_andnot_ps( largeMaskE, t );
00163 HWVector vResult;
00164 vResult.v = _mm_or_ps( t, w );
00165 return vResult;
00166 };
00167
00168 inline HWVector Minimum( const HWVector &o ) const { HWVector r; r.v = _mm_min_ps( v, o.v ); return r; };
00169 inline HWVector Maximum( const HWVector &o ) const { HWVector r; r.v = _mm_max_ps( v, o.v ); return r; };
00170 inline HWVector operator +( const HWVector &o ) const { HWVector r; r.v = _mm_add_ps( v, o.v ); return r; };
00171 inline HWVector operator -( const HWVector &o ) const { HWVector r; r.v = _mm_sub_ps( v, o.v ); return r; };
00172 inline void operator +=( const HWVector &o ) { v = _mm_add_ps( v, o.v ); };
00173 inline void operator -=( const HWVector &o ) { v = _mm_sub_ps( v, o.v ); };
00174 inline void operator *=( const HWVector &o ) { v = _mm_mul_ps( v, o.v ); };
00175 inline void operator /=( const HWVector &o ) { v = _mm_div_ps( v, o.v ); };
00176 inline HWVector operator *( const HWVector &o ) const { HWVector r; r.v = _mm_mul_ps( v, o.v ); return r; };
00177 inline HWVector operator *( float f ) const { HWVector r; r.v = _mm_mul_ps( v, _mm_set1_ps( f ) ); return r; };
00178 inline HWVector operator /( const HWVector &o ) const { HWVector r; r.v = _mm_div_ps( v, o.v ); return r; };
00179 inline void operator *=( float f ) { v = _mm_mul_ps( v, _mm_set1_ps(f ) ); };
00180 inline void Store( float *p ) { _mm_storeu_ps( p, v ); };
00181 inline void Load( float f ) { v = _mm_set_ps1( f ); };
00182 inline void StoreNormalAsInt( int *pBuffer ) const
00183 {
00184
00185
00186
00187
00188 static const MB_SSE_ALIGN16_VAR(float c[4]) = { 32740.0f, 32740.0f, 32740.0f, 32740.0f };
00189
00190 __m128 f = _mm_load_ps( c );
00191 f = _mm_mul_ps( f, v );
00192 __m128i i = _mm_cvtps_epi32( f );
00193 _mm_storeu_si128( (__m128i *)pBuffer, i );
00194 };
00195 inline void StoreAsInt( int *pBuffer ) const
00196 {
00197 __m128i i = _mm_cvtps_epi32( v );
00198 _mm_storeu_si128( (__m128i *)pBuffer, i );
00199 };
00200 inline operator float( void ) const
00201 {
00202 MB_SSE_ALIGN16_VAR(float f);
00203 _mm_store_ss( &f, v );
00204 return f;
00205 };
00206 inline operator mudbox::Vector( void ) const
00207 {
00208 mudbox::Vector r;
00209 __m128 t = _mm_shuffle_ps( v, v, _MM_SHUFFLE(2,1,0,3) );
00210 _mm_store_ss( &r.x, t );
00211 t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00212 _mm_store_ss( &r.y, t );
00213 t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00214 _mm_store_ss( &r.z, t );
00215 return r;
00216 };
00217 inline operator mudbox::Vector4( void ) const
00218 {
00219 mudbox::Vector4 r;
00220 __m128 t = _mm_shuffle_ps( v, v, _MM_SHUFFLE(2,1,0,3) );
00221 _mm_store_ss( &r.x, t );
00222 t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00223 _mm_store_ss( &r.y, t );
00224 t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00225 _mm_store_ss( &r.z, t );
00226 t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00227 _mm_store_ss( &r.w, t );
00228 return r;
00229 };
00230
00231 MB_SSE_ALIGN16_VAR(__m128 v);
00232 };
00233
00234 inline mudbox::Vector &operator <<( mudbox::Vector &v, const HWVector &r )
00235 {
00236 __m128 t = _mm_shuffle_ps( r.v, r.v, _MM_SHUFFLE(2,1,0,3) );
00237 _mm_store_ss( &v.x, t );
00238 t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00239 _mm_store_ss( &v.y, t );
00240 t = _mm_shuffle_ps( t, t, _MM_SHUFFLE(2,1,0,3) );
00241 _mm_store_ss( &v.z, t );
00242 return v;
00243
00245
00246
00247
00248
00249
00250 };
00251
00253
00254 struct HWMatrix
00255 {
00256 inline HWMatrix( void ) {};
00257 inline HWMatrix( const mudbox::Matrix &m )
00258 {
00259 r0 = _mm_loadu_ps( &m._11 );
00260 r1 = _mm_loadu_ps( &m._21 );
00261 r2 = _mm_loadu_ps( &m._31 );
00262 r3 = _mm_loadu_ps( &m._41 );
00263 };
00264 void MirrorX( void )
00265 {
00266 r0 = _mm_shuffle_ps( r0, r0, _MM_SHUFFLE(0, 1, 2, 3) );
00267 r1 = _mm_shuffle_ps( r1, r1, _MM_SHUFFLE(0, 1, 2, 3) );
00268 r2 = _mm_shuffle_ps( r2, r2, _MM_SHUFFLE(0, 1, 2, 3) );
00269 r3 = _mm_shuffle_ps( r3, r3, _MM_SHUFFLE(0, 1, 2, 3) );
00270 };
00271 inline HWVector Transform( const HWVector &v ) const
00272 {
00273 static MB_SSE_ALIGN16_VAR(float c[4]) = { 1, 1, 1, 1 };
00274
00275 __m128 v0 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(3,3,3,3) );
00276 __m128 v1 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(2,2,2,2) );
00277 __m128 v2 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(1,1,1,1) );
00278 __m128 v3 = _mm_load_ps( c );
00279
00280 __m128 a0 = _mm_mul_ps( v0, r0 );
00281 __m128 a1 = _mm_mul_ps( v1, r1 );
00282 __m128 a2 = _mm_mul_ps( v2, r2 );
00283 __m128 a3 = _mm_mul_ps( v3, r3 );
00284
00285 __m128 r = _mm_add_ps( a0, _mm_add_ps( a1, _mm_add_ps( a2, a3 ) ) );
00286
00287 HWVector z;
00288 z.v = r;
00289 return z;
00290 };
00291 inline HWVector ProjectedTransform( const HWVector &v ) const
00292 {
00293 static MB_SSE_ALIGN16_VAR(float c[4]) = { 1, 1, 1, 1 };
00294
00295 __m128 v0 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(3,3,3,3) );
00296 __m128 v1 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(2,2,2,2) );
00297 __m128 v2 = _mm_shuffle_ps( v.v, v.v, _MM_SHUFFLE(1,1,1,1) );
00298 __m128 v3 = _mm_load_ps( c );
00299
00300 __m128 a0 = _mm_mul_ps( v0, r0 );
00301 __m128 a1 = _mm_mul_ps( v1, r1 );
00302 __m128 a2 = _mm_mul_ps( v2, r2 );
00303 __m128 a3 = _mm_mul_ps( v3, r3 );
00304
00305 __m128 r = _mm_add_ps( a0, _mm_add_ps( a1, _mm_add_ps( a2, a3 ) ) );
00306 __m128 d = _mm_shuffle_ps( r, r, _MM_SHUFFLE(3,3,3,3) );
00307
00308 HWVector z;
00309 z.v = _mm_div_ps( r, d );
00310 return z;
00311 };
00312
00313 MB_SSE_ALIGN16_VAR(__m128 r0);
00314 MB_SSE_ALIGN16_VAR(__m128 r1);
00315 MB_SSE_ALIGN16_VAR(__m128 r2);
00316 MB_SSE_ALIGN16_VAR(__m128 r3);
00317 };
00318
00319
00320
00322 bool MBDLL_DECL hasSSE3();
00323
00325 bool MBDLL_DECL hasSSE41();
00326
00328 bool MBDLL_DECL hasSSE42();
00329
00331 bool MBDLL_DECL hasAVX256();
00332
00333
00334
00335 #endif