00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00038 #ifndef FCL_MATH_SIMD_DETAILS_H
00039 #define FCL_MATH_SIMD_DETAILS_H
00040
00041 #include "fcl/data_types.h"
00042
00043 #include <xmmintrin.h>
00044 #if defined (__SSE3__)
00045 #include <pmmintrin.h>
00046 #endif
00047 #if defined (__SSE4__)
00048 #include <smmintrin.h>
00049 #endif
00050
00051
00052 namespace fcl
00053 {
00054
00056 namespace details
00057 {
00058
00059 const __m128 xmms_0 = {0.f, 0.f, 0.f, 0.f};
00060 const __m128d xmmd_0 = {0, 0};
00061
00062 static inline __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
00063 {
00064 return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
00065 }
00066 static inline __m128 vec_sel(__m128 a, __m128 b, const unsigned int* mask)
00067 {
00068 return vec_sel(a, b, _mm_load_ps((float*)mask));
00069 }
00070
00071 static inline __m128 vec_sel(__m128 a, __m128 b, unsigned int mask)
00072 {
00073 return vec_sel(a, b, _mm_set1_ps(*(float*)&mask));
00074 }
00075
00076 static inline __m128 vec_splat(__m128 a, int e)
00077 {
00078 return _mm_shuffle_ps(a, a, _MM_SHUFFLE(e, e, e, e));
00079 }
00080
00081 static inline __m128d vec_splat(__m128d a, int e)
00082 {
00083 return _mm_shuffle_pd(a, a, _MM_SHUFFLE2(e, e));
00084 }
00085
00086 static inline __m128 _mm_ror_ps(__m128 x, int e)
00087 {
00088 return (e % 4) ? _mm_shuffle_ps(x, x, _MM_SHUFFLE((e+3)%4, (e+2)%4, (e+1)%4, e%4)) : x;
00089 }
00090
00091 static inline __m128 _mm_rol_ps(__m128 x, int e)
00092 {
00093 return (e % 4) ? _mm_shuffle_ps(x, x, _MM_SHUFFLE((7-e)%4, (6-e)%4, (5-e)%4, (4-e)%4)) : x;
00094 }
00095
00096 static inline __m128 newtonraphson_rsqrt4(const __m128 v)
00097 {
00098 static const union { float i[4]; __m128 m; } _half4 __attribute__ ((aligned(16))) = {{.5f, .5f, .5f, .5f}};
00099 static const union { float i[4]; __m128 m; } _three __attribute__ ((aligned(16))) = {{3.f, 3.f, 3.f, 3.f}};
00100 __m128 approx = _mm_rsqrt_ps(v);
00101 __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);
00102 return _mm_mul_ps(_mm_mul_ps(_half4.m, approx), _mm_sub_ps(_three.m, muls));
00103 }
00104
00105 struct sse_meta_f4
00106 {
00107 typedef float meta_type;
00108
00109 union {float vs[4]; __m128 v; };
00110 sse_meta_f4() : v(_mm_set1_ps(0)) {}
00111 sse_meta_f4(float x) : v(_mm_set1_ps(x)) {}
00112 sse_meta_f4(float* px) : v(_mm_load_ps(px)) {}
00113 sse_meta_f4(__m128 x) : v(x) {}
00114 sse_meta_f4(float x, float y, float z, float w = 1) : v(_mm_setr_ps(x, y, z, w)) {}
00115 inline void setValue(float x, float y, float z, float w = 1) { v = _mm_setr_ps(x, y, z, w); }
00116 inline void setValue(float x) { v = _mm_set1_ps(x); }
00117 inline void setValue(__m128 x) { v = x; }
00118 inline void negate() { v = _mm_sub_ps(xmms_0, v); }
00119
00120 inline sse_meta_f4& ubound(const sse_meta_f4& u)
00121 {
00122 v = _mm_min_ps(v, u.v);
00123 return *this;
00124 }
00125
00126 inline sse_meta_f4& lbound(const sse_meta_f4& l)
00127 {
00128 v = _mm_max_ps(v, l.v);
00129 return *this;
00130 }
00131
00132 inline void* operator new [] (size_t n) { return _mm_malloc(n, 16); }
00133 inline void operator delete [] (void* x) { if(x) _mm_free(x); }
00134 inline float operator [] (size_t i) const { return vs[i]; }
00135 inline float& operator [] (size_t i) { return vs[i]; }
00136
00137 inline sse_meta_f4 operator + (const sse_meta_f4& other) const { return sse_meta_f4(_mm_add_ps(v, other.v)); }
00138 inline sse_meta_f4 operator - (const sse_meta_f4& other) const { return sse_meta_f4(_mm_sub_ps(v, other.v)); }
00139 inline sse_meta_f4 operator * (const sse_meta_f4& other) const { return sse_meta_f4(_mm_mul_ps(v, other.v)); }
00140 inline sse_meta_f4 operator / (const sse_meta_f4& other) const { return sse_meta_f4(_mm_div_ps(v, other.v)); }
00141 inline sse_meta_f4& operator += (const sse_meta_f4& other) { v = _mm_add_ps(v, other.v); return *this; }
00142 inline sse_meta_f4& operator -= (const sse_meta_f4& other) { v = _mm_sub_ps(v, other.v); return *this; }
00143 inline sse_meta_f4& operator *= (const sse_meta_f4& other) { v = _mm_mul_ps(v, other.v); return *this; }
00144 inline sse_meta_f4& operator /= (const sse_meta_f4& other) { v = _mm_div_ps(v, other.v); return *this; }
00145 inline sse_meta_f4 operator + (float t) const { return sse_meta_f4(_mm_add_ps(v, _mm_set1_ps(t))); }
00146 inline sse_meta_f4 operator - (float t) const { return sse_meta_f4(_mm_sub_ps(v, _mm_set1_ps(t))); }
00147 inline sse_meta_f4 operator * (float t) const { return sse_meta_f4(_mm_mul_ps(v, _mm_set1_ps(t))); }
00148 inline sse_meta_f4 operator / (float t) const { return sse_meta_f4(_mm_div_ps(v, _mm_set1_ps(t))); }
00149 inline sse_meta_f4& operator += (float t) { v = _mm_add_ps(v, _mm_set1_ps(t)); return *this; }
00150 inline sse_meta_f4& operator -= (float t) { v = _mm_sub_ps(v, _mm_set1_ps(t)); return *this; }
00151 inline sse_meta_f4& operator *= (float t) { v = _mm_mul_ps(v, _mm_set1_ps(t)); return *this; }
00152 inline sse_meta_f4& operator /= (float t) { v = _mm_div_ps(v, _mm_set1_ps(t)); return *this; }
00153 inline sse_meta_f4 operator - () const
00154 {
00155 static const union { int i[4]; __m128 m; } negativemask __attribute__ ((aligned(16))) = {{0x80000000, 0x80000000, 0x80000000, 0x80000000}};
00156 return sse_meta_f4(_mm_xor_ps(negativemask.m, v));
00157 }
00158 } __attribute__ ((aligned (16)));
00159
00160 struct sse_meta_d4
00161 {
00162 typedef double meta_type;
00163
00164 union {double vs[4]; __m128d v[2]; };
00165 sse_meta_d4()
00166 {
00167 setValue(0.0);
00168 }
00169
00170 sse_meta_d4(double x)
00171 {
00172 setValue(x);
00173 }
00174
00175 sse_meta_d4(double* px)
00176 {
00177 v[0] = _mm_load_pd(px);
00178 v[1] = _mm_set_pd(0, *(px + 2));
00179 }
00180
00181 sse_meta_d4(__m128d x, __m128d y)
00182 {
00183 v[0] = x;
00184 v[1] = y;
00185 }
00186
00187 sse_meta_d4(double x, double y, double z, double w = 0)
00188 {
00189 setValue(x, y, z, w);
00190 }
00191
00192 inline void setValue(double x, double y, double z, double w = 0)
00193 {
00194 v[0] = _mm_setr_pd(x, y);
00195 v[1] = _mm_setr_pd(z, w);
00196 }
00197
00198 inline void setValue(double x)
00199 {
00200 v[0] = _mm_set1_pd(x);
00201 v[1] = v[0];
00202 }
00203
00204 inline void setValue(__m128d x, __m128d y)
00205 {
00206 v[0] = x;
00207 v[1] = y;
00208 }
00209
00210 inline void negate()
00211 {
00212 v[0] = _mm_sub_pd(xmmd_0, v[0]);
00213 v[1] = _mm_sub_pd(xmmd_0, v[1]);
00214 }
00215
00216 inline sse_meta_d4& ubound(const sse_meta_d4& u)
00217 {
00218 v[0] = _mm_min_pd(v[0], u.v[0]);
00219 v[1] = _mm_min_pd(v[1], u.v[1]);
00220 return *this;
00221 }
00222
00223 inline sse_meta_d4& lbound(const sse_meta_d4& l)
00224 {
00225 v[0] = _mm_max_pd(v[0], l.v[0]);
00226 v[1] = _mm_max_pd(v[1], l.v[1]);
00227 return *this;
00228 }
00229
00230 inline void* operator new [] (size_t n)
00231 {
00232 return _mm_malloc(n, 16);
00233 }
00234
00235 inline void operator delete [] (void* x)
00236 {
00237 if(x) _mm_free(x);
00238 }
00239
00240 inline double operator [] (size_t i) const { return vs[i]; }
00241 inline double& operator [] (size_t i) { return vs[i]; }
00242
00243 inline sse_meta_d4 operator + (const sse_meta_d4& other) const { return sse_meta_d4(_mm_add_pd(v[0], other.v[0]), _mm_add_pd(v[1], other.v[1])); }
00244 inline sse_meta_d4 operator - (const sse_meta_d4& other) const { return sse_meta_d4(_mm_sub_pd(v[0], other.v[0]), _mm_sub_pd(v[1], other.v[1])); }
00245 inline sse_meta_d4 operator * (const sse_meta_d4& other) const { return sse_meta_d4(_mm_mul_pd(v[0], other.v[0]), _mm_mul_pd(v[1], other.v[1])); }
00246 inline sse_meta_d4 operator / (const sse_meta_d4& other) const { return sse_meta_d4(_mm_div_pd(v[0], other.v[0]), _mm_div_pd(v[1], other.v[1])); }
00247 inline sse_meta_d4& operator += (const sse_meta_d4& other) { v[0] = _mm_add_pd(v[0], other.v[0]); v[1] = _mm_add_pd(v[1], other.v[1]); return *this; }
00248 inline sse_meta_d4& operator -= (const sse_meta_d4& other) { v[0] = _mm_sub_pd(v[0], other.v[0]); v[1] = _mm_sub_pd(v[1], other.v[1]); return *this; }
00249 inline sse_meta_d4& operator *= (const sse_meta_d4& other) { v[0] = _mm_mul_pd(v[0], other.v[0]); v[1] = _mm_mul_pd(v[1], other.v[1]); return *this; }
00250 inline sse_meta_d4& operator /= (const sse_meta_d4& other) { v[0] = _mm_div_pd(v[0], other.v[0]); v[1] = _mm_div_pd(v[1], other.v[1]); return *this; }
00251 inline sse_meta_d4 operator + (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_add_pd(v[0], d), _mm_add_pd(v[1], d)); }
00252 inline sse_meta_d4 operator - (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_sub_pd(v[0], d), _mm_sub_pd(v[1], d)); }
00253 inline sse_meta_d4 operator * (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_mul_pd(v[0], d), _mm_mul_pd(v[1], d)); }
00254 inline sse_meta_d4 operator / (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_div_pd(v[0], d), _mm_div_pd(v[1], d)); }
00255 inline sse_meta_d4& operator += (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_add_pd(v[0], d); v[1] = _mm_add_pd(v[1], d); return *this; }
00256 inline sse_meta_d4& operator -= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_sub_pd(v[0], d); v[1] = _mm_sub_pd(v[1], d); return *this; }
00257 inline sse_meta_d4& operator *= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_mul_pd(v[0], d); v[1] = _mm_mul_pd(v[1], d); return *this; }
00258 inline sse_meta_d4& operator /= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_div_pd(v[0], d); v[1] = _mm_div_pd(v[1], d); return *this; }
00259 inline sse_meta_d4 operator - () const
00260 {
00261 static const union { FCL_INT64 i[2]; __m128d m; } negativemask __attribute__ ((aligned(16))) = {{0x8000000000000000, 0x8000000000000000}};
00262 return sse_meta_d4(_mm_xor_pd(v[0], negativemask.m), _mm_xor_pd(v[1], negativemask.m));
00263 }
00264 } __attribute__ ((aligned (16)));
00265
00266
00267
00268 static inline __m128 cross_prod(__m128 x, __m128 y)
00269 {
00270
00271
00272 static const int s1 = _MM_SHUFFLE(3, 0, 2, 1);
00273 static const int s2 = _MM_SHUFFLE(3, 1, 0, 2);
00274 __m128 xa = _mm_mul_ps(_mm_shuffle_ps(x, x, s1), _mm_shuffle_ps(y, y, s2));
00275
00276
00277
00278 __m128 xb = _mm_mul_ps(_mm_shuffle_ps(x, x, s2), _mm_shuffle_ps(y, y, s1));
00279
00280
00281 return _mm_sub_ps(xa, xb);
00282 }
00283
00284 static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y)
00285 {
00286 return sse_meta_f4(cross_prod(x.v, y.v));
00287 }
00288
00289 static inline void cross_prod(__m128d x0, __m128d x1, __m128d y0, __m128d y1, __m128d* z0, __m128d* z1)
00290 {
00291 static const int s0 = _MM_SHUFFLE2(0, 0);
00292 static const int s1 = _MM_SHUFFLE2(0, 1);
00293 static const int s2 = _MM_SHUFFLE2(1, 0);
00294 static const int s3 = _MM_SHUFFLE2(1, 1);
00295 __m128d xa1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s1), _mm_shuffle_pd(y1, y0, s0));
00296 __m128d ya1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s2), _mm_shuffle_pd(y0, y1, s3));
00297
00298 __m128d xa2 = _mm_mul_pd(_mm_shuffle_pd(x1, x0, s0), _mm_shuffle_pd(y0, y1, s1));
00299 __m128d ya2 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s3), _mm_shuffle_pd(y0, y1, s2));
00300
00301 *z0 = _mm_sub_pd(xa1, xa2);
00302 *z1 = _mm_sub_pd(ya1, ya2);
00303 }
00304
00305 static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y)
00306 {
00307 __m128d z0, z1;
00308 cross_prod(x.v[0], x.v[1], y.v[0], y.v[1], &z0, &z1);
00309 return sse_meta_d4(z0, z1);
00310 }
00311
00312
00313 static inline __m128 dot_prod3(__m128 x, __m128 y)
00314 {
00315 register __m128 m = _mm_mul_ps(x, y);
00316 return _mm_add_ps(_mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0)),
00317 _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
00318 }
00319
00320 static inline float dot_prod3(const sse_meta_f4& x, const sse_meta_f4& y)
00321 {
00322 return _mm_cvtss_f32(dot_prod3(x.v, y.v));
00323 }
00324
00325
00326 static inline __m128d dot_prod3(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
00327 {
00328 register __m128d m1 = _mm_mul_pd(x0, y0);
00329 register __m128d m2 = _mm_mul_pd(x1, y1);
00330 return _mm_add_pd(_mm_add_pd(vec_splat(m1, 0), vec_splat(m1, 1)), vec_splat(m2, 0));
00331 }
00332
00333 static inline double dot_prod3(const sse_meta_d4& x, const sse_meta_d4& y)
00334 {
00335 double d;
00336 _mm_storel_pd(&d, dot_prod3(x.v[0], x.v[1], y.v[0], y.v[1]));
00337 return d;
00338 }
00339
00340 static inline __m128 dot_prod4(__m128 x, __m128 y)
00341 {
00342 #if defined (__SSE4__)
00343 return _mm_dp_ps(x, y, 0x71);
00344 #elif defined (__SSE3__)
00345 register __m128 t = _mm_mul_ps(x, y);
00346 t = _mm_hadd_ps(t, t);
00347 return _mm_hadd_ps(t, t);
00348 #else
00349 register __m128 s = _mm_mul_ps(x, y);
00350 register __m128 r = _mm_add_ss(s, _mm_movehl_ps(s, s));
00351 return _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));
00352 #endif
00353 }
00354
00355
00356 static inline float dot_prod4(const sse_meta_f4& x, const sse_meta_f4& y)
00357 {
00358 return _mm_cvtss_f32(dot_prod4(x.v, y.v));
00359 }
00360
00361 static inline __m128d dot_prod4(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
00362 {
00363 #if defined (__SSE4__)
00364 register __m128d t1 = _mm_dp_pd(x0, y0, 0x31);
00365 register __m128d t2 = _mm_dp_pd(x1, y1, 0x11);
00366 return _mm_add_pd(t1, t2);
00367 #elif defined (__SSE3__)
00368 register __m128d t1 = _mm_mul_pd(x0, y0);
00369 register __m128d t2 = _mm_mul_pd(x1, y1);
00370 t1 = _mm_hadd_pd(t1, t1);
00371 t2 = _mm_hadd_pd(t2, t2);
00372 return _mm_add_pd(t1, t2);
00373 #else
00374 register __m128d t1 = _mm_mul_pd(x0, y0);
00375 register __m128d t2 = _mm_mul_pd(x1, y1);
00376 t1 = _mm_add_pd(t1, t2);
00377 return _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1));
00378 #endif
00379 }
00380
00381 static inline double dot_prod4(const sse_meta_d4& x, const sse_meta_d4& y)
00382 {
00383 double d;
00384 _mm_storel_pd(&d, dot_prod4(x.v[0], x.v[1], y.v[0], y.v[1]));
00385 return d;
00386 }
00387
00388 static inline sse_meta_f4 min(const sse_meta_f4& x, const sse_meta_f4& y)
00389 {
00390 return sse_meta_f4(_mm_min_ps(x.v, y.v));
00391 }
00392
00393 static inline sse_meta_d4 min(const sse_meta_d4& x, const sse_meta_d4& y)
00394 {
00395 return sse_meta_d4(_mm_min_pd(x.v[0], y.v[0]), _mm_min_pd(x.v[1], y.v[1]));
00396 }
00397
00398 static inline sse_meta_f4 max(const sse_meta_f4& x, const sse_meta_f4& y)
00399 {
00400 return sse_meta_f4(_mm_max_ps(x.v, y.v));
00401 }
00402
00403 static inline sse_meta_d4 max(const sse_meta_d4& x, const sse_meta_d4& y)
00404 {
00405 return sse_meta_d4(_mm_max_pd(x.v[0], y.v[0]), _mm_max_pd(x.v[1], y.v[1]));
00406 }
00407
00408 static inline sse_meta_f4 abs(const sse_meta_f4& x)
00409 {
00410 static const union { int i[4]; __m128 m; } abs4mask __attribute__ ((aligned (16))) = {{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}};
00411 return sse_meta_f4(_mm_and_ps(x.v, abs4mask.m));
00412 }
00413
00414 static inline sse_meta_d4 abs(const sse_meta_d4& x)
00415 {
00416 static const union { FCL_INT64 i[2]; __m128d m; } abs2mask __attribute__ ((aligned (16))) = {{0x7fffffffffffffff, 0x7fffffffffffffff}};
00417 return sse_meta_d4(_mm_and_pd(x.v[0], abs2mask.m), _mm_and_pd(x.v[1], abs2mask.m));
00418 }
00419
00420 static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsilon)
00421 {
00422 register __m128 d = _mm_sub_ps(x.v, y.v);
00423 register __m128 e = _mm_set1_ps(epsilon);
00424 return ((_mm_movemask_ps(_mm_cmplt_ps(d, e)) & 0x7) == 0x7) && ((_mm_movemask_ps(_mm_cmpgt_ps(d, _mm_sub_ps(xmms_0, e))) & 0x7) == 0x7);
00425 }
00426
00427 static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsilon)
00428 {
00429 register __m128d d = _mm_sub_pd(x.v[0], y.v[0]);
00430 register __m128d e = _mm_set1_pd(epsilon);
00431
00432 if(_mm_movemask_pd(_mm_cmplt_pd(d, e)) != 0x3) return false;
00433 if(_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) != 0x3) return false;
00434
00435 d = _mm_sub_pd(x.v[1], y.v[1]);
00436 if((_mm_movemask_pd(_mm_cmplt_pd(d, e)) & 0x1) != 0x1) return false;
00437 if((_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) & 0x1) != 0x1) return false;
00438 return true;
00439 }
00440
00441 static inline sse_meta_f4 normalize3(const sse_meta_f4& x)
00442 {
00443 register __m128 m = _mm_mul_ps(x.v, x.v);
00444 __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
00445 return sse_meta_f4(_mm_mul_ps(x.v, newtonraphson_rsqrt4(r)));
00446 }
00447
00448 static inline sse_meta_f4 normalize3_approx(const sse_meta_f4& x)
00449 {
00450 register __m128 m = _mm_mul_ps(x.v, x.v);
00451 __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
00452 return sse_meta_f4(_mm_mul_ps(x.v, _mm_rsqrt_ps(r)));
00453 }
00454
00455
00456 static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128* r0, __m128* r1, __m128* r2)
00457 {
00458 static const union { unsigned int i[4]; __m128 m; } selectmask __attribute__ ((aligned(16))) = {{0, 0xffffffff, 0, 0}};
00459 register __m128 t0, t1;
00460 t0 = _mm_unpacklo_ps(c0, c2);
00461 t1 = _mm_unpackhi_ps(c0, c2);
00462 *r0 = _mm_unpacklo_ps(t0, c1);
00463 *r1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 3, 2, 2));
00464 *r1 = vec_sel(*r1, c1, selectmask.i);
00465 *r2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 1, 1, 0));
00466 *r2 = vec_sel(*r2, vec_splat(c1, 2), selectmask.i);
00467 }
00468
00469
00470 static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128* i0, __m128* i1, __m128* i2)
00471 {
00472 __m128 t0, t1, t2, d, invd;
00473 t2 = cross_prod(c0, c1);
00474 t0 = cross_prod(c1, c2);
00475 t1 = cross_prod(c2, c0);
00476 d = dot_prod3(t2, c2);
00477 d = vec_splat(d, 0);
00478 invd = _mm_rcp_ps(d);
00479 transpose(t0, t1, t2, i0, i1, i2);
00480 *i0 = _mm_mul_ps(*i0, invd);
00481 *i1 = _mm_mul_ps(*i1, invd);
00482 *i2 = _mm_mul_ps(*i2, invd);
00483 }
00484
00485
00486 struct sse_meta_f12
00487 {
00488 typedef float meta_type;
00489 typedef sse_meta_f4 vector_type;
00490 sse_meta_f4 c[3];
00491
00492 sse_meta_f12() { setZero(); }
00493
00494 sse_meta_f12(float xx, float xy, float xz,
00495 float yx, float yy, float yz,
00496 float zx, float zy, float zz)
00497 { setValue(xx, xy, xz, yx, yy, yz, zx, zy, zz); }
00498
00499 sse_meta_f12(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
00500 { setColumn(x, y, z); }
00501
00502 sse_meta_f12(__m128 x, __m128 y, __m128 z)
00503 { setColumn(x, y, z); }
00504
00505 inline void setValue(float xx, float xy, float xz,
00506 float yx, float yy, float yz,
00507 float zx, float zy, float zz)
00508 {
00509 c[0].setValue(xx, yx, zx, 0);
00510 c[1].setValue(xy, yy, zy, 0);
00511 c[2].setValue(xz, yz, zz, 0);
00512 }
00513
00514 inline void setIdentity()
00515 {
00516 c[0].setValue(1, 0, 0, 0);
00517 c[1].setValue(0, 1, 0, 0);
00518 c[2].setValue(0, 0, 1, 0);
00519 }
00520
00521 inline void setZero()
00522 {
00523 c[0].setValue(0);
00524 c[1].setValue(0);
00525 c[2].setValue(0);
00526 }
00527
00528 inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
00529 {
00530 c[0] = x; c[1] = y; c[2] = z;
00531 }
00532
00533 inline void setColumn(__m128 x, __m128 y, __m128 z)
00534 {
00535 c[0].setValue(x); c[1].setValue(y); c[2].setValue(z);
00536 }
00537
00538 inline const sse_meta_f4& getColumn(size_t i) const
00539 {
00540 return c[i];
00541 }
00542
00543 inline sse_meta_f4& getColumn(size_t i)
00544 {
00545 return c[i];
00546 }
00547
00548 inline sse_meta_f4 getRow(size_t i) const
00549 {
00550 return sse_meta_f4(c[0][i], c[1][i], c[2][i], 0);
00551 }
00552
00553 inline float operator () (size_t i, size_t j) const
00554 {
00555 return c[j][i];
00556 }
00557
00558 inline float& operator () (size_t i, size_t j)
00559 {
00560 return c[j][i];
00561 }
00562
00563 inline sse_meta_f4 operator * (const sse_meta_f4& v) const
00564 {
00565 return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))), _mm_mul_ps(c[2].v, vec_splat(v.v, 2))));
00566 }
00567
00568 inline sse_meta_f12 operator * (const sse_meta_f12& mat) const
00569 {
00570 return sse_meta_f12((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]);
00571 }
00572
00573 inline sse_meta_f12 operator + (const sse_meta_f12& mat) const
00574 {
00575 return sse_meta_f12(c[0] + mat.c[0], c[1] + mat.c[1], c[2] + mat.c[2]);
00576 }
00577
00578 inline sse_meta_f12 operator - (const sse_meta_f12& mat) const
00579 {
00580 return sse_meta_f12(c[0] - mat.c[0], c[1] - mat.c[1], c[2] - mat.c[2]);
00581 }
00582
00583 inline sse_meta_f12 operator + (float t_) const
00584 {
00585 sse_meta_f4 t(t_);
00586 return sse_meta_f12(c[0] + t, c[1] + t, c[2] + t);
00587 }
00588
00589 inline sse_meta_f12 operator - (float t_) const
00590 {
00591 sse_meta_f4 t(t_);
00592 return sse_meta_f12(c[0] - t, c[1] - t, c[2] - t);
00593 }
00594
00595 inline sse_meta_f12 operator * (float t_) const
00596 {
00597 sse_meta_f4 t(t_);
00598 return sse_meta_f12(c[0] * t, c[1] * t, c[2] * t);
00599 }
00600
00601 inline sse_meta_f12 operator / (float t_) const
00602 {
00603 sse_meta_f4 t(t_);
00604 return sse_meta_f12(c[0] / t, c[1] / t, c[2] / t);
00605 }
00606
00607 inline sse_meta_f12& operator *= (const sse_meta_f12& mat)
00608 {
00609 setColumn((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]);
00610 return *this;
00611 }
00612
00613 inline sse_meta_f12& operator += (const sse_meta_f12& mat)
00614 {
00615 c[0] += mat.c[0];
00616 c[1] += mat.c[1];
00617 c[2] += mat.c[2];
00618 return *this;
00619 }
00620
00621 inline sse_meta_f12& operator -= (const sse_meta_f12& mat)
00622 {
00623 c[0] -= mat.c[0];
00624 c[1] -= mat.c[1];
00625 c[2] -= mat.c[2];
00626 return *this;
00627 }
00628
00629 inline sse_meta_f12& operator += (float t_)
00630 {
00631 sse_meta_f4 t(t_);
00632 c[0] += t;
00633 c[1] += t;
00634 c[2] += t;
00635 return *this;
00636 }
00637
00638 inline sse_meta_f12& operator -= (float t_)
00639 {
00640 sse_meta_f4 t(t_);
00641 c[0] -= t;
00642 c[1] -= t;
00643 c[2] -= t;
00644 return *this;
00645 }
00646
00647 inline sse_meta_f12& operator *= (float t_)
00648 {
00649 sse_meta_f4 t(t_);
00650 c[0] *= t;
00651 c[1] *= t;
00652 c[2] *= t;
00653 return *this;
00654 }
00655
00656 inline sse_meta_f12& operator /= (float t_)
00657 {
00658 sse_meta_f4 t(t_);
00659 c[0] /= t;
00660 c[1] /= t;
00661 c[2] /= t;
00662 return *this;
00663 }
00664
00665 inline sse_meta_f12& inverse()
00666 {
00667 __m128 inv0, inv1, inv2;
00668 details::inverse(c[0].v, c[1].v, c[2].v, &inv0, &inv1, &inv2);
00669 setColumn(inv0, inv1, inv2);
00670 return *this;
00671 }
00672
00673 inline sse_meta_f12& transpose()
00674 {
00675 __m128 r0, r1, r2;
00676 details::transpose(c[0].v, c[1].v, c[2].v, &r0, &r1, &r2);
00677 setColumn(r0, r1, r2);
00678 return *this;
00679 }
00680
00681 inline sse_meta_f12& abs()
00682 {
00683 c[0] = details::abs(c[0]);
00684 c[1] = details::abs(c[1]);
00685 c[2] = details::abs(c[2]);
00686 return *this;
00687 }
00688
00689 inline float determinant() const
00690 {
00691 return _mm_cvtss_f32(dot_prod3(c[2].v, cross_prod(c[0].v, c[1].v)));
00692 }
00693
00694 inline sse_meta_f12 transposeTimes(const sse_meta_f12& other) const
00695 {
00696 return sse_meta_f12(dot_prod3(c[0], other.c[0]), dot_prod3(c[0], other.c[1]), dot_prod3(c[0], other.c[2]),
00697 dot_prod3(c[1], other.c[0]), dot_prod3(c[1], other.c[1]), dot_prod3(c[1], other.c[2]),
00698 dot_prod3(c[2], other.c[0]), dot_prod3(c[2], other.c[1]), dot_prod3(c[2], other.c[2]));
00699 }
00700
00701 inline sse_meta_f12 timesTranspose(const sse_meta_f12& m) const
00702 {
00703 sse_meta_f12 tmp(m);
00704 return (*this) * tmp.transpose();
00705 }
00706
00707 inline sse_meta_f4 transposeTimes(const sse_meta_f4& v) const
00708 {
00709 return sse_meta_f4(dot_prod3(c[0], v), dot_prod3(c[1], v), dot_prod3(c[2], v));
00710 }
00711
00712 inline float transposeDot(size_t i, const sse_meta_f4& v) const
00713 {
00714 return dot_prod3(c[i], v);
00715 }
00716
00717 inline float dot(size_t i, const sse_meta_f4& v) const
00718 {
00719 return v[0] * c[0][i] + v[1] * c[1][i] + v[2] * c[2][i];
00720 }
00721
00722 };
00723
00724 static inline sse_meta_f12 abs(const sse_meta_f12& mat)
00725 {
00726 return sse_meta_f12(abs(mat.getColumn(0)), abs(mat.getColumn(1)), abs(mat.getColumn(2)));
00727 }
00728
00729 static inline sse_meta_f12 transpose(const sse_meta_f12& mat)
00730 {
00731 __m128 r0, r1, r2;
00732 transpose(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &r0, &r1, &r2);
00733 return sse_meta_f12(r0, r1, r2);
00734 }
00735
00736
00737 static inline sse_meta_f12 inverse(const sse_meta_f12& mat)
00738 {
00739 __m128 inv0, inv1, inv2;
00740 inverse(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &inv0, &inv1, &inv2);
00741 return sse_meta_f12(inv0, inv1, inv2);
00742 }
00743
00744
00745 static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128 c3,
00746 __m128* r0, __m128* r1, __m128* r2, __m128* r3)
00747 {
00748 __m128 tmp0 = _mm_unpacklo_ps(c0, c2);
00749 __m128 tmp1 = _mm_unpacklo_ps(c1, c3);
00750 __m128 tmp2 = _mm_unpackhi_ps(c0, c2);
00751 __m128 tmp3 = _mm_unpackhi_ps(c1, c3);
00752 *r0 = _mm_unpacklo_ps(tmp0, tmp1);
00753 *r1 = _mm_unpackhi_ps(tmp0, tmp1);
00754 *r2 = _mm_unpacklo_ps(tmp2, tmp3);
00755 *r3 = _mm_unpackhi_ps(tmp2, tmp3);
00756 }
00757
00758
00759 static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128 c3,
00760 __m128* res0, __m128* res1, __m128* res2, __m128* res3)
00761 {
00762 __m128 Va, Vb, Vc;
00763 __m128 r1, r2, r3, tt, tt2;
00764 __m128 sum, Det, RDet;
00765 __m128 trns0, trns1, trns2, trns3;
00766
00767
00768
00769 tt = c3; tt2 = _mm_ror_ps(c2,1);
00770 Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));
00771 Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));
00772 Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));
00773
00774 r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));
00775 r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));
00776 r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));
00777
00778 tt = c1;
00779 Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1);
00780 Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
00781 Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
00782
00783
00784 Det = _mm_mul_ps(sum,c0);
00785 Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
00786
00787 static const union { int i[4]; __m128 m; } Sign_PNPN __attribute__ ((aligned(16))) = {{0x00000000, 0x80000000, 0x00000000, 0x80000000}};
00788 static const union { int i[4]; __m128 m; } Sign_NPNP __attribute__ ((aligned(16))) = {{0x80000000, 0x00000000, 0x80000000, 0x00000000}};
00789 static const union { float i[4]; __m128 m; } ZERONE __attribute__ ((aligned(16))) = {{1.0f, 0.0f, 0.0f, 1.0f}};
00790
00791 __m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN.m);
00792
00793
00794 tt = _mm_ror_ps(c0,1); sum = _mm_mul_ps(tt,r1);
00795 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
00796 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
00797 __m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP.m);
00798
00799
00800 Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
00801
00802
00803 tt = _mm_ror_ps(c0,1);
00804 Va = _mm_mul_ps(tt,Vb);
00805 Vb = _mm_mul_ps(tt,Vc);
00806 Vc = _mm_mul_ps(tt,c1);
00807
00808 r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));
00809 r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));
00810 r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));
00811
00812 tt = _mm_ror_ps(c3,1); sum = _mm_mul_ps(tt,r1);
00813 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
00814 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
00815 __m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN.m);
00816
00817
00818 RDet = _mm_div_ss(ZERONE.m, Det);
00819 RDet = _mm_shuffle_ps(RDet,RDet,0x00);
00820
00821
00822 mtL1 = _mm_mul_ps(mtL1, RDet);
00823 mtL2 = _mm_mul_ps(mtL2, RDet);
00824 mtL3 = _mm_mul_ps(mtL3, RDet);
00825
00826
00827 tt = _mm_ror_ps(c2,1); sum = _mm_mul_ps(tt,r1);
00828 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
00829 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
00830 __m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP.m);
00831 mtL4 = _mm_mul_ps(mtL4, RDet);
00832
00833
00834 trns0 = _mm_unpacklo_ps(mtL1,mtL2);
00835 trns1 = _mm_unpacklo_ps(mtL3,mtL4);
00836 trns2 = _mm_unpackhi_ps(mtL1,mtL2);
00837 trns3 = _mm_unpackhi_ps(mtL3,mtL4);
00838 *res0 = _mm_movelh_ps(trns0,trns1);
00839 *res1 = _mm_movehl_ps(trns1,trns0);
00840 *res2 = _mm_movelh_ps(trns2,trns3);
00841 *res3 = _mm_movehl_ps(trns3,trns2);
00842 }
00843
00844
00845 struct sse_meta_f16
00846 {
00847 typedef float meta_type;
00848 typedef sse_meta_f4 vector_type;
00849 sse_meta_f4 c[4];
00850
00851 sse_meta_f16() { setZero(); }
00852
00853 sse_meta_f16(float xx, float xy, float xz, float xw,
00854 float yx, float yy, float yz, float yw,
00855 float zx, float zy, float zz, float zw,
00856 float wx, float wy, float wz, float ww)
00857 { setValue(xx, xy, xz, xw, yz, yy, yz, yw, zx, zy, zz, zw, wx, wy, wz, ww); }
00858
00859 sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
00860 { setColumn(x, y, z, w); }
00861
00862 sse_meta_f16(__m128 x, __m128 y, __m128 z, __m128 w)
00863 { setColumn(x, y, z, w); }
00864
00865 inline void setValue(float xx, float xy, float xz, float xw,
00866 float yx, float yy, float yz, float yw,
00867 float zx, float zy, float zz, float zw,
00868 float wx, float wy, float wz, float ww)
00869 {
00870 c[0].setValue(xx, yx, zx, wx);
00871 c[1].setValue(xy, yy, zy, wy);
00872 c[2].setValue(xz, yz, zz, wz);
00873 c[3].setValue(xw, yw, zw, ww);
00874 }
00875
00876 inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
00877 {
00878 c[0] = x; c[1] = y; c[2] = z; c[3] = w;
00879 }
00880
00881 inline void setColumn(__m128 x, __m128 y, __m128 z, __m128 w)
00882 {
00883 c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); c[3].setValue(w);
00884 }
00885
00886 inline void setIdentity()
00887 {
00888 c[0].setValue(1, 0, 0, 0);
00889 c[1].setValue(0, 1, 0, 0);
00890 c[2].setValue(0, 0, 1, 0);
00891 c[3].setValue(0, 0, 0, 1);
00892 }
00893
00894 inline void setZero()
00895 {
00896 c[0].setValue(0);
00897 c[1].setValue(0);
00898 c[2].setValue(0);
00899 c[3].setValue(0);
00900 }
00901
00902 inline const sse_meta_f4& getColumn(size_t i) const
00903 {
00904 return c[i];
00905 }
00906
00907 inline sse_meta_f4& getColumn(size_t i)
00908 {
00909 return c[i];
00910 }
00911
00912 inline sse_meta_f4 getRow(size_t i) const
00913 {
00914 return sse_meta_f4(c[0][i], c[1][i], c[2][i], c[3][i]);
00915 }
00916
00917 inline float operator () (size_t i, size_t j) const
00918 {
00919 return c[j][i];
00920 }
00921
00922 inline float& operator () (size_t i, size_t j)
00923 {
00924 return c[j][i];
00925 }
00926
00927 inline sse_meta_f4 operator * (const sse_meta_f4& v) const
00928 {
00929 return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))),
00930 _mm_add_ps(_mm_mul_ps(c[2].v, vec_splat(v.v, 2)), _mm_mul_ps(c[3].v, vec_splat(v.v, 3)))
00931 ));
00932 }
00933
00934 inline sse_meta_f16 operator * (const sse_meta_f16& mat) const
00935 {
00936 return sse_meta_f16((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]);
00937 }
00938
00939
00940 inline sse_meta_f16 operator + (const sse_meta_f16& mat) const
00941 {
00942 return sse_meta_f16(c[0] + mat.c[0], c[1] + mat.c[1], c[2] + mat.c[2], c[3] + mat.c[3]);
00943 }
00944
00945 inline sse_meta_f16 operator - (const sse_meta_f16& mat) const
00946 {
00947 return sse_meta_f16(c[0] - mat.c[0], c[1] - mat.c[1], c[2] - mat.c[2], c[3] - mat.c[3]);
00948 }
00949
00950 inline sse_meta_f16 operator + (float t_) const
00951 {
00952 sse_meta_f4 t(t_);
00953 return sse_meta_f16(c[0] + t, c[1] + t, c[2] + t, c[3] + t);
00954 }
00955
00956 inline sse_meta_f16 operator - (float t_) const
00957 {
00958 sse_meta_f4 t(t_);
00959 return sse_meta_f16(c[0] - t, c[1] - t, c[2] - t, c[3] - t);
00960 }
00961
00962 inline sse_meta_f16 operator * (float t_) const
00963 {
00964 sse_meta_f4 t(t_);
00965 return sse_meta_f16(c[0] * t, c[1] * t, c[2] * t, c[3] * t);
00966 }
00967
00968 inline sse_meta_f16 operator / (float t_) const
00969 {
00970 sse_meta_f4 t(t_);
00971 return sse_meta_f16(c[0] / t, c[1] / t, c[2] / t, c[3] / t);
00972 }
00973
00974 inline sse_meta_f16& operator *= (const sse_meta_f16& mat)
00975 {
00976 setColumn((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]);
00977 return *this;
00978 }
00979
00980 inline sse_meta_f16& operator += (const sse_meta_f16& mat)
00981 {
00982 c[0] += mat.c[0];
00983 c[1] += mat.c[1];
00984 c[2] += mat.c[2];
00985 c[3] += mat.c[3];
00986 return *this;
00987 }
00988
00989 inline sse_meta_f16& operator -= (const sse_meta_f16& mat)
00990 {
00991 c[0] -= mat.c[0];
00992 c[1] -= mat.c[1];
00993 c[2] -= mat.c[2];
00994 c[3] -= mat.c[3];
00995 return *this;
00996 }
00997
00998 inline sse_meta_f16& operator += (float t_)
00999 {
01000 sse_meta_f4 t(t_);
01001 c[0] += t;
01002 c[1] += t;
01003 c[2] += t;
01004 c[3] += t;
01005 return *this;
01006 }
01007
01008 inline sse_meta_f16& operator -= (float t_)
01009 {
01010 sse_meta_f4 t(t_);
01011 c[0] -= t;
01012 c[1] -= t;
01013 c[2] -= t;
01014 c[3] -= t;
01015 return *this;
01016 }
01017
01018 inline sse_meta_f16& operator *= (float t_)
01019 {
01020 sse_meta_f4 t(t_);
01021 c[0] *= t;
01022 c[1] *= t;
01023 c[2] *= t;
01024 c[3] *= t;
01025 return *this;
01026 }
01027
01028 inline sse_meta_f16& operator /= (float t_)
01029 {
01030 sse_meta_f4 t(t_);
01031 c[0] /= t;
01032 c[1] /= t;
01033 c[2] /= t;
01034 c[3] /= t;
01035 return *this;
01036 }
01037
01038 inline sse_meta_f16& abs()
01039 {
01040 c[0] = details::abs(c[0]);
01041 c[1] = details::abs(c[1]);
01042 c[2] = details::abs(c[2]);
01043 c[3] = details::abs(c[3]);
01044 return *this;
01045 }
01046
01047 inline sse_meta_f16& inverse()
01048 {
01049 __m128 r0, r1, r2, r3;
01050 details::inverse(c[0].v, c[1].v, c[2].v, c[3].v, &r0, &r1, &r2, &r3);
01051 setColumn(r0, r1, r2, r3);
01052 return *this;
01053 }
01054
01055 inline sse_meta_f16& transpose()
01056 {
01057 __m128 r0, r1, r2, r3;
01058 details::transpose(c[0].v, c[1].v, c[2].v, c[3].v, &r0, &r1, &r2, &r3);
01059 setColumn(r0, r1, r2, r3);
01060 return *this;
01061 }
01062
01063 inline float determinant() const
01064 {
01065 __m128 Va, Vb, Vc;
01066 __m128 r1, r2, r3, tt, tt2;
01067 __m128 sum, Det;
01068
01069 __m128 _L1 = c[0].v;
01070 __m128 _L2 = c[1].v;
01071 __m128 _L3 = c[2].v;
01072 __m128 _L4 = c[3].v;
01073
01074
01075
01076 tt = _L4; tt2 = _mm_ror_ps(_L3,1);
01077 Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));
01078 Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));
01079 Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));
01080
01081 r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));
01082 r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));
01083 r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));
01084
01085 tt = _L2;
01086 Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1);
01087 Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
01088 Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
01089
01090
01091 Det = _mm_mul_ps(sum,_L1);
01092 Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
01093
01094
01095 tt = _mm_ror_ps(_L1,1); sum = _mm_mul_ps(tt,r1);
01096 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
01097 tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
01098
01099
01100 Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
01101 return _mm_cvtss_f32(Det);
01102 }
01103
01104 inline sse_meta_f16 transposeTimes(const sse_meta_f16& other) const
01105 {
01106 return sse_meta_f16(dot_prod4(c[0], other.c[0]), dot_prod4(c[0], other.c[1]), dot_prod4(c[0], other.c[2]), dot_prod4(c[0], other.c[3]),
01107 dot_prod4(c[1], other.c[0]), dot_prod4(c[1], other.c[1]), dot_prod4(c[1], other.c[2]), dot_prod4(c[1], other.c[3]),
01108 dot_prod4(c[2], other.c[0]), dot_prod4(c[2], other.c[1]), dot_prod4(c[2], other.c[2]), dot_prod4(c[2], other.c[3]),
01109 dot_prod4(c[3], other.c[0]), dot_prod4(c[3], other.c[1]), dot_prod4(c[3], other.c[2]), dot_prod4(c[3], other.c[3]));
01110 }
01111
01112 inline sse_meta_f16 timesTranspose(const sse_meta_f16& m) const
01113 {
01114 sse_meta_f16 tmp(m);
01115 return (*this) * tmp.transpose();
01116 }
01117
01118 inline sse_meta_f4 transposeTimes(const sse_meta_f4& v) const
01119 {
01120 return sse_meta_f4(dot_prod4(c[0], v), dot_prod4(c[1], v), dot_prod4(c[2], v), dot_prod4(c[3], v));
01121 }
01122
01123 inline float transposeDot(size_t i, const sse_meta_f4& v) const
01124 {
01125 return dot_prod4(c[i], v);
01126 }
01127
01128 inline float dot(size_t i, const sse_meta_f4& v) const
01129 {
01130 return v[0] * c[0][i] + v[1] * c[1][i] + v[2] * c[2][i] + v[3] * c[3][i];
01131 }
01132
01133 };
01134
01135 static inline sse_meta_f16 abs(const sse_meta_f16& mat)
01136 {
01137 return sse_meta_f16(abs(mat.getColumn(0)), abs(mat.getColumn(1)), abs(mat.getColumn(2)), abs(mat.getColumn(3)));
01138 }
01139
01140 static inline sse_meta_f16 transpose(const sse_meta_f16& mat)
01141 {
01142 __m128 r0, r1, r2, r3;
01143 transpose(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, mat.getColumn(3).v, &r0, &r1, &r2, &r3);
01144 return sse_meta_f16(r0, r1, r2, r3);
01145 }
01146
01147
01148 static inline sse_meta_f16 inverse(const sse_meta_f16& mat)
01149 {
01150 __m128 r0, r1, r2, r3;
01151 inverse(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, mat.getColumn(3).v, &r0, &r1, &r2, &r3);
01152 return sse_meta_f16(r0, r1, r2, r3);
01153 }
01154
01155
01156
01157
01158 }
01159 }
01160
01161
01162 #endif