From e68ded92af72653b87a09c95cd271cd66d799f62 Mon Sep 17 00:00:00 2001 From: jpan <jpan@253336fb-580f-4252-a368-f3cef5a2a82b> Date: Mon, 11 Jun 2012 23:10:30 +0000 Subject: [PATCH] git-svn-id: https://kforge.ros.org/fcl/fcl_ros@97 253336fb-580f-4252-a368-f3cef5a2a82b --- trunk/fcl/include/fcl/math_details.h | 2 +- trunk/fcl/include/fcl/math_simd_details.h | 559 ++++++++++++++++++---- trunk/fcl/include/fcl/vec_3f.h | 10 +- 3 files changed, 476 insertions(+), 95 deletions(-) diff --git a/trunk/fcl/include/fcl/math_details.h b/trunk/fcl/include/fcl/math_details.h index b5b8a3aa..ae29c78c 100644 --- a/trunk/fcl/include/fcl/math_details.h +++ b/trunk/fcl/include/fcl/math_details.h @@ -116,7 +116,7 @@ static inline Vec3Data<T> cross_prod(const Vec3Data<T>& l, const Vec3Data<T>& r) } template <typename T> -static inline T dot_prod(const Vec3Data<T>& l, const Vec3Data<T>& r) +static inline T dot_prod3(const Vec3Data<T>& l, const Vec3Data<T>& r) { return l.vs[0] * r.vs[0] + l.vs[1] * r.vs[1] + l.vs[2] * r.vs[2]; } diff --git a/trunk/fcl/include/fcl/math_simd_details.h b/trunk/fcl/include/fcl/math_simd_details.h index c2cdc3d7..d71196d4 100644 --- a/trunk/fcl/include/fcl/math_simd_details.h +++ b/trunk/fcl/include/fcl/math_simd_details.h @@ -57,6 +57,40 @@ namespace details const __m128 xmms_0 = {0.f, 0.f, 0.f, 0.f}; const __m128d xmmd_0 = {0, 0}; +static inline __m128 vec_sel(__m128 a, __m128 b, __m128 mask) +{ + return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a)); +} + +static inline __m128 vec_splat(__m128 a, int e) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(e, e, e, e)); +} + +static inline __m128d vec_splat(__m128d a, int e) +{ + return _mm_shuffle_pd(a, a, _MM_SHUFFLE2(e, e)); +} + +static inline __m128 _mm_ror_ps(__m128 x, int e) +{ + return (e % 4) ? _mm_shuffle_ps(x, x, _MM_SHUFFLE((e+3)%4, (e+2)%4, (e+1)%4, e%4)) : x; +} + +static inline __m128 _mm_rol_ps(__m128 x, int e) +{ + return (e % 4) ? _mm_shuffle_ps(x, x, _MM_SHUFFLE((7-e)%4, (6-e)%4, (5-e)%4, (4-e)%4)) : x; +} + +static inline __m128 newtonraphson_rsqrt4(const __m128 v) +{ + static const union { float i[4]; __m128 m; } _half4 __attribute__ ((aligned(16))) = {{.5f, .5f, .5f, .5f}}; + static const union { float i[4]; __m128 m; } _three __attribute__ ((aligned(16))) = {{3.f, 3.f, 3.f, 3.f}}; + __m128 approx = _mm_rsqrt_ps(v); + __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx); + return _mm_mul_ps(_mm_mul_ps(_half4.m, approx), _mm_sub_ps(_three.m, muls)); +} + struct sse_meta_f4 { typedef float meta_type; @@ -93,7 +127,11 @@ struct sse_meta_f4 inline sse_meta_f4& operator -= (float t) { v = _mm_sub_ps(v, _mm_set1_ps(t)); return *this; } inline sse_meta_f4& operator *= (float t) { v = _mm_mul_ps(v, _mm_set1_ps(t)); return *this; } inline sse_meta_f4& operator /= (float t) { v = _mm_div_ps(v, _mm_set1_ps(t)); return *this; } - inline sse_meta_f4 operator - () const { return sse_meta_f4(_mm_sub_ps(xmms_0, v)); } + inline sse_meta_f4 operator - () const + { + static const union { int i[4]; __m128 m; } negativemask __attribute__ ((aligned(16))) = {{0x80000000, 0x80000000, 0x80000000, 0x80000000}}; + return sse_meta_f4(_mm_xor_ps(negativemask.m, v)); + } } __attribute__ ((aligned (16))); struct sse_meta_d4 @@ -181,87 +219,135 @@ struct sse_meta_d4 inline sse_meta_d4& operator -= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_sub_pd(v[0], d); v[1] = _mm_sub_pd(v[1], d); return *this; } inline sse_meta_d4& operator *= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_mul_pd(v[0], d); v[1] = _mm_mul_pd(v[1], d); return *this; } inline sse_meta_d4& operator /= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_div_pd(v[0], d); v[1] = _mm_div_pd(v[1], d); return *this; } - inline sse_meta_d4 operator - () const { return sse_meta_d4(_mm_sub_pd(xmmd_0, v[0]), _mm_sub_pd(xmmd_0, v[1])); } + inline sse_meta_d4 operator - () const + { + static const union { int64_t i[2]; __m128d m; } negativemask __attribute__ ((aligned(16))) = {{0x8000000000000000, 0x8000000000000000}}; + return sse_meta_d4(_mm_xor_pd(v[0], negativemask.m), _mm_xor_pd(v[1], negativemask.m)); + } } __attribute__ ((aligned (16))); -static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y) + +static inline __m128 cross_prod(__m128 x, __m128 y) { // set to a[1][2][0][3] , b[2][0][1][3] // multiply static const int s1 = _MM_SHUFFLE(3, 0, 2, 1); static const int s2 = _MM_SHUFFLE(3, 1, 0, 2); - __m128 xa = _mm_mul_ps(_mm_shuffle_ps(x.v, x.v, s1), _mm_shuffle_ps(y.v, y.v, s2)); + __m128 xa = _mm_mul_ps(_mm_shuffle_ps(x, x, s1), _mm_shuffle_ps(y, y, s2)); // set to a[2][0][1][3] , b[1][2][0][3] // multiply - __m128 xb = _mm_mul_ps(_mm_shuffle_ps(x.v, x.v, s2), _mm_shuffle_ps(y.v, y.v, s1)); + __m128 xb = _mm_mul_ps(_mm_shuffle_ps(x, x, s2), _mm_shuffle_ps(y, y, s1)); // subtract - return sse_meta_f4(_mm_sub_ps(xa, xb)); + return _mm_sub_ps(xa, xb); } -static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y) +static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y) +{ + return sse_meta_f4(cross_prod(x.v, y.v)); +} + +static inline void cross_prod(__m128d x0, __m128d x1, __m128d y0, __m128d y1, __m128d* z0, __m128d* z1) { static const int s0 = _MM_SHUFFLE2(0, 0); static const int s1 = _MM_SHUFFLE2(0, 1); static const int s2 = _MM_SHUFFLE2(1, 0); static const int s3 = _MM_SHUFFLE2(1, 1); - __m128d xa1 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], x.v[1], s1), _mm_shuffle_pd(y.v[1], y.v[0], s0)); - __m128d ya1 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], x.v[1], s2), _mm_shuffle_pd(y.v[0], y.v[1], s3)); + __m128d xa1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s1), _mm_shuffle_pd(y1, y0, s0)); + __m128d ya1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s2), _mm_shuffle_pd(y0, y1, s3)); - __m128d xa2 = _mm_mul_pd(_mm_shuffle_pd(x.v[1], x.v[0], s0), _mm_shuffle_pd(y.v[0], y.v[1], s1)); - __m128d ya2 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], y.v[1], s3), _mm_shuffle_pd(y.v[0], y.v[1], s2)); + __m128d xa2 = _mm_mul_pd(_mm_shuffle_pd(x1, x0, s0), _mm_shuffle_pd(y0, y1, s1)); + __m128d ya2 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s3), _mm_shuffle_pd(y0, y1, s2)); + + *z0 = _mm_sub_pd(xa1, xa2); + *z1 = _mm_sub_pd(ya1, ya2); +} + +static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y) +{ + __m128d z0, z1; + cross_prod(x.v[0], x.v[1], y.v[0], y.v[1], &z0, &z1); + return sse_meta_d4(z0, z1); +} + - return sse_meta_d4(_mm_sub_pd(xa1, xa2), _mm_sub_pd(ya1, ya2)); +static inline __m128 dot_prod3(__m128 x, __m128 y) +{ + register __m128 m = _mm_mul_ps(x, y); + return _mm_add_ps(_mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2))); } -static inline float dot_prod(const sse_meta_f4& x, const sse_meta_f4& y) +static inline float dot_prod3(const sse_meta_f4& x, const sse_meta_f4& y) +{ + return _mm_cvtss_f32(dot_prod3(x.v, y.v)); +} + + +static inline __m128d dot_prod3(__m128d x0, __m128d x1, __m128d y0, __m128d y1) +{ + register __m128d m1 = _mm_mul_pd(x0, y0); + register __m128d m2 = _mm_mul_pd(x1, y1); + return _mm_add_pd(_mm_add_pd(vec_splat(m1, 0), vec_splat(m1, 1)), vec_splat(m2, 0)); +} + +static inline double dot_prod3(const sse_meta_d4& x, const sse_meta_d4& y) +{ + double d; + _mm_storel_pd(&d, dot_prod3(x.v[0], x.v[1], y.v[0], y.v[1])); + return d; +} + +static inline __m128 dot_prod4(__m128 x, __m128 y) { #if defined (__SSE4__) - return _mm_cvtss_f32(_mm_dp_ps(x.v, y.v, 0x71)); + return _mm_dp_ps(x, y, 0x71); #elif defined (__SSE3__) - register __m128 t = _mm_mul_ps(x.v, y.v); - t = _mm_hadd_ps(t, t); + register __m128 t = _mm_mul_ps(x, y); t = _mm_hadd_ps(t, t); - return _mm_cvtss_f32(t); + return _mm_hadd_ps(t, t); #else - register __m128 s = _mm_mul_ps(x.v, y.v); + register __m128 s = _mm_mul_ps(x, y); register __m128 r = _mm_add_ss(s, _mm_movehl_ps(s, s)); - r = _mm_add_ss(r, _mm_shuffle_ps(r, r, 1)); - return _mm_cvtss_f32(r); + return _mm_add_ss(r, _mm_shuffle_ps(r, r, 1)); #endif } -static inline double dot_prod(const sse_meta_d4& x, const sse_meta_d4& y) + +static inline float dot_prod4(const sse_meta_f4& x, const sse_meta_f4& y) +{ + return _mm_cvtss_f32(dot_prod4(x.v, y.v)); +} + +static inline __m128d dot_prod4(__m128d x0, __m128d x1, __m128d y0, __m128d y1) { #if defined (__SSE4__) - register __m128d t1 = _mm_dp_pd(x.v[0], y.v[0], 0x31); - register __m128d t2 = _mm_dp_pd(x.v[1], y.v[1], 0x11); - t1 = _mm_add_pd(t1, t2); - double d; - _mm_storel_pd(&d, t1); - return d; + register __m128d t1 = _mm_dp_pd(x0, y0, 0x31); + register __m128d t2 = _mm_dp_pd(x1, y1, 0x11); + return _mm_add_pd(t1, t2); #elif defined (__SSE3__) - double d; - register __m128d t1 = _mm_mul_pd(x.v[0], y.v[0]); - register __m128d t2 = _mm_mul_pd(x.v[1], y.v[1]); + register __m128d t1 = _mm_mul_pd(x0, y0); + register __m128d t2 = _mm_mul_pd(x1, y1); t1 = _mm_hadd_pd(t1, t1); t2 = _mm_hadd_pd(t2, t2); - t1 = _mm_add_pd(t1, t2); - _mm_storeh_pl(&d, t1); - return d; + return _mm_add_pd(t1, t2); #else - double d; - register __m128d t1 = _mm_mul_pd(x.v[0], y.v[0]); - register __m128d t2 = _mm_mul_pd(x.v[1], y.v[1]); + register __m128d t1 = _mm_mul_pd(x0, y0); + register __m128d t2 = _mm_mul_pd(x1, y1); t1 = _mm_add_pd(t1, t2); - t1 = _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1)); - _mm_storel_pd(&d, t1); - return d; + return _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1)); #endif } +static inline double dot_prod4(const sse_meta_d4& x, const sse_meta_d4& y) +{ + double d; + _mm_storel_pd(&d, dot_prod4(x.v[0], x.v[1], y.v[0], y.v[1])); + return d; +} + static inline sse_meta_f4 min(const sse_meta_f4& x, const sse_meta_f4& y) { return sse_meta_f4(_mm_min_ps(x.v, y.v)); @@ -284,21 +370,18 @@ static inline sse_meta_d4 max(const sse_meta_d4& x, const sse_meta_d4& y) static inline sse_meta_f4 abs(const sse_meta_f4& x) { - static const union { int i[4]; __m128 m; } abs4mask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; + static const union { int i[4]; __m128 m; } abs4mask __attribute__ ((aligned (16))) = {{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}}; return sse_meta_f4(_mm_and_ps(x.v, abs4mask.m)); } static inline sse_meta_d4 abs(const sse_meta_d4& x) { - static const union { int64_t i[2]; __m128d m; } abs2mask = {0x7fffffffffffffff, 0x7fffffffffffffff}; + static const union { int64_t i[2]; __m128d m; } abs2mask __attribute__ ((aligned (16))) = {{0x7fffffffffffffff, 0x7fffffffffffffff}}; return sse_meta_d4(_mm_and_pd(x.v[0], abs2mask.m), _mm_and_pd(x.v[1], abs2mask.m)); } static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsilon) { - epsilon = 1e-3; - //static const union { int i[4]; __m128 m; } abs4mask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; - //return (_mm_movemask_ps(_mm_cmplt_ps(_mm_and_ps(_mm_sub_ps(x.v, y.v), abs4mask.m), _mm_set1_ps(epsilon))) == 0x0f); register __m128 d = _mm_sub_ps(x.v, y.v); register __m128 e = _mm_set1_ps(epsilon); return ((_mm_movemask_ps(_mm_cmplt_ps(d, e)) & 0x7) == 0x7) && ((_mm_movemask_ps(_mm_cmpgt_ps(d, _mm_sub_ps(xmms_0, e))) & 0x7) == 0x7); @@ -306,9 +389,6 @@ static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsil static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsilon) { - // static const union { int64 i[2]; __m128d m; } abs2mask = {0x7fffffffffffffff, 0x7fffffffffffffff}; - // return (_mm_movemask_pd(_mm_cmplt_pd(_mm_and_pd(_mm_sub_pd(x.v[0], y.v[0]), abs2mask.m), _mm_set1_pd(epsilon))) == 0x3) && (_mm_movemask_pd(_mm_cmplt_pd(_mm_and_pd(_mm_sub_pd(x.v[1], y.v[1]), abs2mask.m), _mm_set1_pd(epsilon))) == 0x3); - register __m128d d = _mm_sub_pd(x.v[0], y.v[0]); register __m128d e = _mm_set1_pd(epsilon); @@ -321,62 +401,220 @@ static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsi return true; } +static inline sse_meta_f4 normalize3(const sse_meta_f4& x) +{ + register __m128 m = _mm_mul_ps(x.v, x.v); + __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2))); + return sse_meta_f4(_mm_mul_ps(x.v, newtonraphson_rsqrt4(r))); +} +static inline sse_meta_f4 normalize3_approx(const sse_meta_f4& x) +{ + register __m128 m = _mm_mul_ps(x.v, x.v); + __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2))); + return sse_meta_f4(_mm_mul_ps(x.v, _mm_rsqrt_ps(r))); +} -struct sse_meta_f16 + +struct sse_meta_f12 { typedef float meta_type; + sse_meta_f4 c[3]; - sse_meta_f4 c[4]; - - sse_meta_f16(float xx, float xy, float xz, + sse_meta_f12(float xx, float xy, float xz, float yx, float yy, float yz, float zx, float zy, float zz) + { setValue(xx, xy, xz, yz, yy, yz, zx, zy, zz); } + + sse_meta_f12(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z) + { setColumn(x, y, z); } + + sse_meta_f12(__m128 x, __m128 y, __m128 z) + { setColumn(x, y, z); } + + inline void setValue(float xx, float xy, float xz, + float yx, float yy, float yz, + float zx, float zy, float zz) { - setValue(xx, xy, xz, yz, yy, yz, zx, zy, zz); + c[0].setValue(xx, yx, zx, 0); + c[1].setValue(xy, yy, zy, 0); + c[2].setValue(xz, yz, zz, 0); } - sse_meta_f16(float xx, float xy, float xz, float xw, - float yx, float yy, float yz, float yw, - float zx, float zy, float zz, float zw, - float wx, float wy, float wz, float ww) + inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z) { - setValue(xx, xy, xz, xw, - yz, yy, yz, yw, - zx, zy, zz, zw, - wx, wy, wz, ww); + c[0] = x; c[1] = y; c[2] = z; } - sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z) + inline void setColumn(__m128 x, __m128 y, __m128 z) { - setColumn(x, y, z); + c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); } - sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w) + inline const sse_meta_f4& getColumn(size_t i) const { - setColumn(x, y, z, w); + return c[i]; } - sse_meta_f16(__m128 x, __m128 y, __m128 z) + inline sse_meta_f4& getColumn(size_t i) { - setColumn(x, y, z); + return c[i]; } - sse_meta_f16(__m128 x, __m128 y, __m128 z, __m128 w) + inline sse_meta_f4 getRow(size_t i) const { - setColumn(x, y, z, w); + return sse_meta_f4(c[0][i], c[1][i], c[2][i], 0); } - inline void setValue(float xx, float xy, float xz, - float yx, float yy, float yz, - float zx, float zy, float zz) + inline float operator () (size_t i, size_t j) const { - c[0].setValue(xx, yx, zx, 0); - c[1].setValue(xy, yy, zy, 0); - c[2].setValue(xz, yz, zz, 0); - c[3].setValue(0, 0, 0, 1); + return c[j][i]; + } + + inline float& operator () (size_t i, size_t j) + { + return c[j][i]; + } + + inline sse_meta_f4 operator * (const sse_meta_f4& v) const + { + return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))), _mm_mul_ps(c[2].v, vec_splat(v.v, 2)))); + } + + inline sse_meta_f12 operator * (const sse_meta_f12& mat) const + { + return sse_meta_f12((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]); + } + + inline sse_meta_f12 operator + (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f12(c[0] + t, c[1] + t, c[2] + t); + } + + inline sse_meta_f12 operator - (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f12(c[0] - t, c[1] - t, c[2] - t); + } + + inline sse_meta_f12 operator * (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f12(c[0] * t, c[1] * t, c[2] * t); + } + + inline sse_meta_f12 operator / (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f12(c[0] / t, c[1] / t, c[2] / t); + } + + + inline sse_meta_f12& operator += (float t_) + { + sse_meta_f4 t(t_); + c[0] += t; + c[1] += t; + c[2] += t; + return *this; } + inline sse_meta_f12& operator -= (float t_) + { + sse_meta_f4 t(t_); + c[0] -= t; + c[1] -= t; + c[2] -= t; + return *this; + } + + inline sse_meta_f12& operator *= (float t_) + { + sse_meta_f4 t(t_); + c[0] *= t; + c[1] *= t; + c[2] *= t; + return *this; + } + + inline sse_meta_f12& operator /= (float t_) + { + sse_meta_f4 t(t_); + c[0] /= t; + c[1] /= t; + c[2] /= t; + return *this; + } + +}; + + +static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128* r0, __m128* r1, __m128* r2) +{ + static const union { int i[4]; __m128 m; } selectmask __attribute__ ((aligned(16))) = {{0, 0xffffffff, 0, 0}}; + register __m128 t0, t1; + t0 = _mm_unpackhi_ps(c0, c2); + t1 = _mm_unpacklo_ps(c0, c2); + *r0 = _mm_unpackhi_ps(t0, c1); + *r1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 3, 2, 2)); + *r1 = vec_sel(*r1, c1, selectmask.m); + *r2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 1, 1, 0)); + *r2 = vec_sel(*r2, vec_splat(c1, 1), selectmask.m); +} + +static inline sse_meta_f12 transpose(const sse_meta_f12& mat) +{ + __m128 r0, r1, r2; + transpose(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &r0, &r1, &r2); + return sse_meta_f12(r0, r1, r2); +} + +static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128* i0, __m128* i1, __m128* i2) +{ + __m128 t0, t1, t2, d, invd; + t2 = cross_prod(c0, c1); + t0 = cross_prod(c1, c2); + t1 = cross_prod(c2, c0); + d = dot_prod3(t2, c2); + d = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 0, 0, 0)); + invd = _mm_rcp_ps(d); // approximate inverse + transpose(t0, t1, t2, i0, i1, i2); + *i0 = _mm_mul_ps(*i0, invd); + *i1 = _mm_mul_ps(*i1, invd); + *i2 = _mm_mul_ps(*i2, invd); +} + +static inline sse_meta_f12 inverse(const sse_meta_f12& mat) +{ + __m128 inv0, inv1, inv2; + inverse(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &inv0, &inv1, &inv2); + return sse_meta_f12(inv0, inv1, inv2); +} + +static inline float determinant(const sse_meta_f12& mat) +{ + return _mm_cvtss_f32(dot_prod3(mat.getColumn(2).v, cross_prod(mat.getColumn(0).v, mat.getColumn(1).v))); +} + + +struct sse_meta_f16 +{ + typedef float meta_type; + sse_meta_f4 c[4]; + + sse_meta_f16(float xx, float xy, float xz, float xw, + float yx, float yy, float yz, float yw, + float zx, float zy, float zz, float zw, + float wx, float wy, float wz, float ww) + { setValue(xx, xy, xz, xw, yz, yy, yz, yw, zx, zy, zz, zw, wx, wy, wz, ww); } + + sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w) + { setColumn(x, y, z, w); } + + sse_meta_f16(__m128 x, __m128 y, __m128 z, __m128 w) + { setColumn(x, y, z, w); } + inline void setValue(float xx, float xy, float xz, float xw, float yx, float yy, float yz, float yw, float zx, float zy, float zz, float zw, @@ -388,21 +626,11 @@ struct sse_meta_f16 c[3].setValue(xw, yw, zw, ww); } - inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z) - { - c[0] = x; c[1] = y; c[2] = z; c[3].setValue(0, 0, 0, 1); - } - inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w) { c[0] = x; c[1] = y; c[2] = z; c[3] = w; } - inline void setColumn(__m128 x, __m128 y, __m128 z) - { - c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); c[3].setValue(0, 0, 0, 1); - } - inline void setColumn(__m128 x, __m128 y, __m128 z, __m128 w) { c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); c[3].setValue(w); @@ -433,12 +661,10 @@ struct sse_meta_f16 return c[j][i]; } - - inline sse_meta_f4 operator * (const sse_meta_f4& v) const { - return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(0, 0, 0, 0))), _mm_mul_ps(c[1].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(1, 1, 1, 1)))), - _mm_add_ps(_mm_mul_ps(c[2].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(2, 2, 2, 2))), _mm_mul_ps(c[3].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(3, 3, 3, 3)))) + return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))), + _mm_add_ps(_mm_mul_ps(c[2].v, vec_splat(v.v, 2)), _mm_mul_ps(c[3].v, vec_splat(v.v, 3))) )); } @@ -447,7 +673,70 @@ struct sse_meta_f16 return sse_meta_f16((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]); } + inline sse_meta_f16 operator + (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f16(c[0] + t, c[1] + t, c[2] + t, c[3] + t); + } + inline sse_meta_f16 operator - (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f16(c[0] - t, c[1] - t, c[2] - t, c[3] - t); + } + + inline sse_meta_f16 operator * (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f16(c[0] * t, c[1] * t, c[2] * t, c[3] * t); + } + + inline sse_meta_f16 operator / (float t_) const + { + sse_meta_f4 t(t_); + return sse_meta_f16(c[0] / t, c[1] / t, c[2] / t, c[3] / t); + } + + + inline sse_meta_f16& operator += (float t_) + { + sse_meta_f4 t(t_); + c[0] += t; + c[1] += t; + c[2] += t; + c[3] += t; + return *this; + } + + inline sse_meta_f16& operator -= (float t_) + { + sse_meta_f4 t(t_); + c[0] -= t; + c[1] -= t; + c[2] -= t; + c[3] -= t; + return *this; + } + + inline sse_meta_f16& operator *= (float t_) + { + sse_meta_f4 t(t_); + c[0] *= t; + c[1] *= t; + c[2] *= t; + c[3] *= t; + return *this; + } + + inline sse_meta_f16& operator /= (float t_) + { + sse_meta_f4 t(t_); + c[0] /= t; + c[1] /= t; + c[2] /= t; + c[3] /= t; + return *this; + } }; @@ -460,6 +749,98 @@ sse_meta_f16 transpose(const sse_meta_f16& mat) __m128 tmp3 = _mm_unpacklo_ps(mat.getColumn(1).v, mat.getColumn(3).v); return sse_meta_f16(_mm_unpackhi_ps(tmp0, tmp1), _mm_unpacklo_ps(tmp0, tmp1), _mm_unpackhi_ps(tmp2, tmp3), _mm_unpacklo_ps(tmp2, tmp3)); } + + +sse_meta_f16 inverse(const sse_meta_f16& mat) +{ + __m128 Va, Vb, Vc; + __m128 r1, r2, r3, tt, tt2; + __m128 sum, Det, RDet; + __m128 trns0, trns1, trns2, trns3; + + __m128 _L1 = mat.getColumn(0).v; + __m128 _L2 = mat.getColumn(1).v; + __m128 _L3 = mat.getColumn(2).v; + __m128 _L4 = mat.getColumn(3).v; + // Calculating the minterms for the first line. + + tt = _L4; tt2 = _mm_ror_ps(_L3,1); + Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0)); // V3'\B7V4 + Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2)); // V3'\B7V4" + Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3)); // V3'\B7V4^ + + r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V3"\B7V4^ - V3^\B7V4" + r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V3^\B7V4' - V3'\B7V4^ + r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V3'\B7V4" - V3"\B7V4' + + tt = _L2; + Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1); + Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2)); + Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3)); + + // Calculating the determinant. + Det = _mm_mul_ps(sum,_L1); + Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det)); + + static const union { int i[4]; __m128 m; } Sign_PNPN __attribute__ ((aligned(16))) = {{0x00000000, 0x80000000, 0x00000000, 0x80000000}}; + static const union { int i[4]; __m128 m; } Sign_NPNP __attribute__ ((aligned(16))) = {{0x80000000, 0x00000000, 0x80000000, 0x00000000}}; + static const union { float i[4]; __m128 m; } ZERONE __attribute__ ((aligned(16))) = {{1.0f, 0.0f, 0.0f, 1.0f}}; + + __m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN.m); + + // Calculating the minterms of the second line (using previous results). + tt = _mm_ror_ps(_L1,1); sum = _mm_mul_ps(tt,r1); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2)); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3)); + __m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP.m); + + // Testing the determinant. + Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1)); + + // Calculating the minterms of the third line. + tt = _mm_ror_ps(_L1,1); + Va = _mm_mul_ps(tt,Vb); // V1'\B7V2" + Vb = _mm_mul_ps(tt,Vc); // V1'\B7V2^ + Vc = _mm_mul_ps(tt,_L2); // V1'\B7V2 + + r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V1"\B7V2^ - V1^\B7V2" + r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V1^\B7V2' - V1'\B7V2^ + r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V1'\B7V2" - V1"\B7V2' + + tt = _mm_ror_ps(_L4,1); sum = _mm_mul_ps(tt,r1); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2)); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3)); + __m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN.m); + + // Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs). + RDet = _mm_div_ss(ZERONE.m, Det); // TODO: just 1.0f? + RDet = _mm_shuffle_ps(RDet,RDet,0x00); + + // Devide the first 12 minterms with the determinant. + mtL1 = _mm_mul_ps(mtL1, RDet); + mtL2 = _mm_mul_ps(mtL2, RDet); + mtL3 = _mm_mul_ps(mtL3, RDet); + + // Calculate the minterms of the forth line and devide by the determinant. + tt = _mm_ror_ps(_L3,1); sum = _mm_mul_ps(tt,r1); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2)); + tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3)); + __m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP.m); + mtL4 = _mm_mul_ps(mtL4, RDet); + + // Now we just have to transpose the minterms matrix. + trns0 = _mm_unpacklo_ps(mtL1,mtL2); + trns1 = _mm_unpacklo_ps(mtL3,mtL4); + trns2 = _mm_unpackhi_ps(mtL1,mtL2); + trns3 = _mm_unpackhi_ps(mtL3,mtL4); + _L1 = _mm_movelh_ps(trns0,trns1); + _L2 = _mm_movehl_ps(trns1,trns0); + _L3 = _mm_movelh_ps(trns2,trns3); + _L4 = _mm_movehl_ps(trns3,trns2); + + return sse_meta_f16(_L1, _L2, _L3, _L4); +} + diff --git a/trunk/fcl/include/fcl/vec_3f.h b/trunk/fcl/include/fcl/vec_3f.h index 1781814c..7360e5f9 100644 --- a/trunk/fcl/include/fcl/vec_3f.h +++ b/trunk/fcl/include/fcl/vec_3f.h @@ -80,10 +80,10 @@ public: inline Vec3fX& operator /= (U t) { data /= t; return *this; } inline Vec3fX operator - () { return Vec3fX(-data); } inline Vec3fX cross(const Vec3fX& other) const { return Vec3fX(details::cross_prod(data, other.data)); } - inline U dot(const Vec3fX& other) const { return details::dot_prod(data, other.data); } + inline U dot(const Vec3fX& other) const { return details::dot_prod3(data, other.data); } inline bool normalize() { - U sqr_length = details::dot_prod(data, data); + U sqr_length = details::dot_prod3(data, data); if(sqr_length > 0) { *this /= (U)sqrt(sqr_length); @@ -95,15 +95,15 @@ public: inline Vec3fX normalized() const { - U sqr_length = details::dot_prod(data, data); + U sqr_length = details::dot_prod3(data, data); if(sqr_length > 0) return *this / (U)sqrt(sqr_length); else return *this; } - inline U length() const { return sqrt(details::dot_prod(data, data)); } - inline U sqrLength() const { return details::dot_prod(data, data); } + inline U length() const { return sqrt(details::dot_prod3(data, data)); } + inline U sqrLength() const { return details::dot_prod3(data, data); } inline void setValue(U x, U y, U z) { data.setValue(x, y, z); } inline void setValue(U x) { data.setValue(x); } inline bool equal(const Vec3fX& other, U epsilon = std::numeric_limits<U>::epsilon() * 100) const { return details::equal(data, other.data, epsilon); } -- GitLab