From e68ded92af72653b87a09c95cd271cd66d799f62 Mon Sep 17 00:00:00 2001
From: jpan <jpan@253336fb-580f-4252-a368-f3cef5a2a82b>
Date: Mon, 11 Jun 2012 23:10:30 +0000
Subject: [PATCH] git-svn-id: https://kforge.ros.org/fcl/fcl_ros@97
 253336fb-580f-4252-a368-f3cef5a2a82b

---
 trunk/fcl/include/fcl/math_details.h      |   2 +-
 trunk/fcl/include/fcl/math_simd_details.h | 559 ++++++++++++++++++----
 trunk/fcl/include/fcl/vec_3f.h            |  10 +-
 3 files changed, 476 insertions(+), 95 deletions(-)

diff --git a/trunk/fcl/include/fcl/math_details.h b/trunk/fcl/include/fcl/math_details.h
index b5b8a3aa..ae29c78c 100644
--- a/trunk/fcl/include/fcl/math_details.h
+++ b/trunk/fcl/include/fcl/math_details.h
@@ -116,7 +116,7 @@ static inline Vec3Data<T> cross_prod(const Vec3Data<T>& l, const Vec3Data<T>& r)
 }
 
 template <typename T>
-static inline T dot_prod(const Vec3Data<T>& l, const Vec3Data<T>& r)
+static inline T dot_prod3(const Vec3Data<T>& l, const Vec3Data<T>& r)
 {
   return l.vs[0] * r.vs[0] + l.vs[1] * r.vs[1] + l.vs[2] * r.vs[2];
 }
diff --git a/trunk/fcl/include/fcl/math_simd_details.h b/trunk/fcl/include/fcl/math_simd_details.h
index c2cdc3d7..d71196d4 100644
--- a/trunk/fcl/include/fcl/math_simd_details.h
+++ b/trunk/fcl/include/fcl/math_simd_details.h
@@ -57,6 +57,40 @@ namespace details
 const __m128  xmms_0 = {0.f, 0.f, 0.f, 0.f};
 const __m128d xmmd_0 = {0, 0};
 
+static inline __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
+{
+  return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
+}
+
+static inline __m128 vec_splat(__m128 a, int e)
+{
+  return _mm_shuffle_ps(a, a, _MM_SHUFFLE(e, e, e, e));
+}
+
+static inline __m128d vec_splat(__m128d a, int e)
+{
+  return _mm_shuffle_pd(a, a, _MM_SHUFFLE2(e, e));
+}
+
+static inline __m128 _mm_ror_ps(__m128 x, int e)
+{
+  return (e % 4) ? _mm_shuffle_ps(x, x, _MM_SHUFFLE((e+3)%4, (e+2)%4, (e+1)%4, e%4)) : x;
+}
+
+static inline __m128 _mm_rol_ps(__m128 x, int e)
+{
+  return (e % 4) ? _mm_shuffle_ps(x, x, _MM_SHUFFLE((7-e)%4, (6-e)%4, (5-e)%4, (4-e)%4)) : x;
+}
+
+static inline __m128 newtonraphson_rsqrt4(const __m128 v)
+{
+  static const union { float i[4]; __m128 m; } _half4 __attribute__ ((aligned(16))) = {{.5f, .5f, .5f, .5f}};
+  static const union { float i[4]; __m128 m; } _three __attribute__ ((aligned(16))) = {{3.f, 3.f, 3.f, 3.f}};
+  __m128 approx = _mm_rsqrt_ps(v);
+  __m128 muls = _mm_mul_ps(_mm_mul_ps(v, approx), approx);
+  return _mm_mul_ps(_mm_mul_ps(_half4.m, approx), _mm_sub_ps(_three.m, muls));
+}
+
 struct sse_meta_f4
 {
   typedef float meta_type;
@@ -93,7 +127,11 @@ struct sse_meta_f4
   inline sse_meta_f4& operator -= (float t) { v = _mm_sub_ps(v, _mm_set1_ps(t)); return *this; }
   inline sse_meta_f4& operator *= (float t) { v = _mm_mul_ps(v, _mm_set1_ps(t)); return *this; }
   inline sse_meta_f4& operator /= (float t) { v = _mm_div_ps(v, _mm_set1_ps(t)); return *this; }
-  inline sse_meta_f4 operator - () const { return sse_meta_f4(_mm_sub_ps(xmms_0, v)); }
+  inline sse_meta_f4 operator - () const 
+  {
+    static const union { int i[4]; __m128 m; } negativemask __attribute__ ((aligned(16))) = {{0x80000000, 0x80000000, 0x80000000, 0x80000000}};
+    return sse_meta_f4(_mm_xor_ps(negativemask.m, v));                     
+  }
 } __attribute__ ((aligned (16)));
 
 struct sse_meta_d4
@@ -181,87 +219,135 @@ struct sse_meta_d4
   inline sse_meta_d4& operator -= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_sub_pd(v[0], d); v[1] = _mm_sub_pd(v[1], d); return *this; }
   inline sse_meta_d4& operator *= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_mul_pd(v[0], d); v[1] = _mm_mul_pd(v[1], d); return *this; }
   inline sse_meta_d4& operator /= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_div_pd(v[0], d); v[1] = _mm_div_pd(v[1], d); return *this; }
-  inline sse_meta_d4 operator - () const { return sse_meta_d4(_mm_sub_pd(xmmd_0, v[0]), _mm_sub_pd(xmmd_0, v[1])); }
+  inline sse_meta_d4 operator - () const 
+  {
+    static const union { int64_t i[2]; __m128d m; } negativemask __attribute__ ((aligned(16))) = {{0x8000000000000000, 0x8000000000000000}};
+    return sse_meta_d4(_mm_xor_pd(v[0], negativemask.m), _mm_xor_pd(v[1], negativemask.m));
+  }
 } __attribute__ ((aligned (16)));
 
 
-static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y)
+
+static inline __m128 cross_prod(__m128 x, __m128 y)
 {
   // set to a[1][2][0][3] , b[2][0][1][3]
   // multiply
   static const int s1 = _MM_SHUFFLE(3, 0, 2, 1);
   static const int s2 = _MM_SHUFFLE(3, 1, 0, 2);
-  __m128 xa = _mm_mul_ps(_mm_shuffle_ps(x.v, x.v, s1), _mm_shuffle_ps(y.v, y.v, s2));
+  __m128 xa = _mm_mul_ps(_mm_shuffle_ps(x, x, s1), _mm_shuffle_ps(y, y, s2));
 
   // set to a[2][0][1][3] , b[1][2][0][3]
   // multiply
-  __m128 xb = _mm_mul_ps(_mm_shuffle_ps(x.v, x.v, s2), _mm_shuffle_ps(y.v, y.v, s1));
+  __m128 xb = _mm_mul_ps(_mm_shuffle_ps(x, x, s2), _mm_shuffle_ps(y, y, s1));
 
   // subtract
-  return sse_meta_f4(_mm_sub_ps(xa, xb));
+  return _mm_sub_ps(xa, xb);
 }
 
-static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y)
+static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y)
+{
+  return sse_meta_f4(cross_prod(x.v, y.v));
+}
+
+static inline void cross_prod(__m128d x0, __m128d x1, __m128d y0, __m128d y1, __m128d* z0, __m128d* z1)
 {
   static const int s0 = _MM_SHUFFLE2(0, 0);
   static const int s1 = _MM_SHUFFLE2(0, 1);
   static const int s2 = _MM_SHUFFLE2(1, 0);
   static const int s3 = _MM_SHUFFLE2(1, 1);  
-  __m128d xa1 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], x.v[1], s1), _mm_shuffle_pd(y.v[1], y.v[0], s0));
-  __m128d ya1 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], x.v[1], s2), _mm_shuffle_pd(y.v[0], y.v[1], s3));
+  __m128d xa1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s1), _mm_shuffle_pd(y1, y0, s0));
+  __m128d ya1 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s2), _mm_shuffle_pd(y0, y1, s3));
   
-  __m128d xa2 = _mm_mul_pd(_mm_shuffle_pd(x.v[1], x.v[0], s0), _mm_shuffle_pd(y.v[0], y.v[1], s1));
-  __m128d ya2 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], y.v[1], s3), _mm_shuffle_pd(y.v[0], y.v[1], s2));
+  __m128d xa2 = _mm_mul_pd(_mm_shuffle_pd(x1, x0, s0), _mm_shuffle_pd(y0, y1, s1));
+  __m128d ya2 = _mm_mul_pd(_mm_shuffle_pd(x0, x1, s3), _mm_shuffle_pd(y0, y1, s2));
+
+  *z0 = _mm_sub_pd(xa1, xa2);
+  *z1 = _mm_sub_pd(ya1, ya2);
+}
+
+static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y)
+{
+  __m128d z0, z1;
+  cross_prod(x.v[0], x.v[1], y.v[0], y.v[1], &z0, &z1);
+  return sse_meta_d4(z0, z1);
+}
+
 
-  return sse_meta_d4(_mm_sub_pd(xa1, xa2), _mm_sub_pd(ya1, ya2));
+static inline __m128 dot_prod3(__m128 x, __m128 y)
+{
+  register __m128 m = _mm_mul_ps(x, y);
+  return _mm_add_ps(_mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0)),
+                    _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
 }
 
-static inline float dot_prod(const sse_meta_f4& x, const sse_meta_f4& y)
+static inline float dot_prod3(const sse_meta_f4& x, const sse_meta_f4& y)
+{
+  return _mm_cvtss_f32(dot_prod3(x.v, y.v));
+}
+
+
+static inline __m128d dot_prod3(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
+{
+  register __m128d m1 = _mm_mul_pd(x0, y0);
+  register __m128d m2 = _mm_mul_pd(x1, y1);
+  return _mm_add_pd(_mm_add_pd(vec_splat(m1, 0), vec_splat(m1, 1)), vec_splat(m2, 0));
+}
+
+static inline double dot_prod3(const sse_meta_d4& x, const sse_meta_d4& y)
+{
+  double d;
+  _mm_storel_pd(&d, dot_prod3(x.v[0], x.v[1], y.v[0], y.v[1]));
+  return d;
+}
+
+static inline __m128 dot_prod4(__m128 x, __m128 y)
 {
 #if defined (__SSE4__)
-  return _mm_cvtss_f32(_mm_dp_ps(x.v, y.v, 0x71));
+  return _mm_dp_ps(x, y, 0x71);
 #elif defined (__SSE3__)
-  register __m128 t = _mm_mul_ps(x.v, y.v);
-  t = _mm_hadd_ps(t, t);
+  register __m128 t = _mm_mul_ps(x, y);
   t = _mm_hadd_ps(t, t);
-  return _mm_cvtss_f32(t);
+  return _mm_hadd_ps(t, t);
 #else
-  register __m128 s = _mm_mul_ps(x.v, y.v);
+  register __m128 s = _mm_mul_ps(x, y);
   register __m128 r = _mm_add_ss(s, _mm_movehl_ps(s, s));
-  r = _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));
-  return _mm_cvtss_f32(r);
+  return _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));
 #endif
 }
 
-static inline double dot_prod(const sse_meta_d4& x, const sse_meta_d4& y)
+
+static inline float dot_prod4(const sse_meta_f4& x, const sse_meta_f4& y)
+{
+  return _mm_cvtss_f32(dot_prod4(x.v, y.v));
+}
+
+static inline __m128d dot_prod4(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
 {
 #if defined (__SSE4__)
-  register __m128d t1 = _mm_dp_pd(x.v[0], y.v[0], 0x31);
-  register __m128d t2 = _mm_dp_pd(x.v[1], y.v[1], 0x11);
-  t1 = _mm_add_pd(t1, t2);
-  double d;
-  _mm_storel_pd(&d, t1);
-  return d;
+  register __m128d t1 = _mm_dp_pd(x0, y0, 0x31);
+  register __m128d t2 = _mm_dp_pd(x1, y1, 0x11);
+  return _mm_add_pd(t1, t2);
 #elif defined (__SSE3__)
-  double d;
-  register __m128d t1 = _mm_mul_pd(x.v[0], y.v[0]);
-  register __m128d t2 = _mm_mul_pd(x.v[1], y.v[1]);
+  register __m128d t1 = _mm_mul_pd(x0, y0);
+  register __m128d t2 = _mm_mul_pd(x1, y1);
   t1 = _mm_hadd_pd(t1, t1);
   t2 = _mm_hadd_pd(t2, t2);
-  t1 = _mm_add_pd(t1, t2);
-  _mm_storeh_pl(&d, t1);
-  return d;
+  return _mm_add_pd(t1, t2);
 #else 
-  double d;
-  register __m128d t1 = _mm_mul_pd(x.v[0], y.v[0]);
-  register __m128d t2 = _mm_mul_pd(x.v[1], y.v[1]);
+  register __m128d t1 = _mm_mul_pd(x0, y0);
+  register __m128d t2 = _mm_mul_pd(x1, y1);
   t1 = _mm_add_pd(t1, t2);
-  t1 = _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1));
-  _mm_storel_pd(&d, t1);
-  return d;
+  return _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1));
 #endif
 }
 
+static inline double dot_prod4(const sse_meta_d4& x, const sse_meta_d4& y)
+{
+  double d;
+  _mm_storel_pd(&d, dot_prod4(x.v[0], x.v[1], y.v[0], y.v[1]));
+  return d;
+}
+
 static inline sse_meta_f4 min(const sse_meta_f4& x, const sse_meta_f4& y)
 {
   return sse_meta_f4(_mm_min_ps(x.v, y.v));
@@ -284,21 +370,18 @@ static inline sse_meta_d4 max(const sse_meta_d4& x, const sse_meta_d4& y)
 
 static inline sse_meta_f4 abs(const sse_meta_f4& x)
 {
-  static const union { int i[4]; __m128 m; } abs4mask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+  static const union { int i[4]; __m128 m; } abs4mask __attribute__ ((aligned (16))) = {{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}};
   return sse_meta_f4(_mm_and_ps(x.v, abs4mask.m));
 }
 
 static inline sse_meta_d4 abs(const sse_meta_d4& x)
 {
-  static const union { int64_t i[2]; __m128d m; } abs2mask = {0x7fffffffffffffff, 0x7fffffffffffffff};
+  static const union { int64_t i[2]; __m128d m; } abs2mask __attribute__ ((aligned (16))) = {{0x7fffffffffffffff, 0x7fffffffffffffff}};
   return sse_meta_d4(_mm_and_pd(x.v[0], abs2mask.m), _mm_and_pd(x.v[1], abs2mask.m));
 }
 
 static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsilon)
 {
-  epsilon = 1e-3;
-  //static const union { int i[4]; __m128 m; } abs4mask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-  //return (_mm_movemask_ps(_mm_cmplt_ps(_mm_and_ps(_mm_sub_ps(x.v, y.v), abs4mask.m), _mm_set1_ps(epsilon))) == 0x0f);
   register __m128 d = _mm_sub_ps(x.v, y.v);
   register __m128 e = _mm_set1_ps(epsilon);
   return ((_mm_movemask_ps(_mm_cmplt_ps(d, e)) & 0x7) == 0x7) && ((_mm_movemask_ps(_mm_cmpgt_ps(d, _mm_sub_ps(xmms_0, e))) & 0x7) == 0x7);
@@ -306,9 +389,6 @@ static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsil
 
 static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsilon)
 {
-  // static const union { int64 i[2]; __m128d m; } abs2mask = {0x7fffffffffffffff, 0x7fffffffffffffff};
-  // return (_mm_movemask_pd(_mm_cmplt_pd(_mm_and_pd(_mm_sub_pd(x.v[0], y.v[0]), abs2mask.m), _mm_set1_pd(epsilon))) == 0x3) && (_mm_movemask_pd(_mm_cmplt_pd(_mm_and_pd(_mm_sub_pd(x.v[1], y.v[1]), abs2mask.m), _mm_set1_pd(epsilon))) == 0x3);
-
   register __m128d d = _mm_sub_pd(x.v[0], y.v[0]);
   register __m128d e = _mm_set1_pd(epsilon);
   
@@ -321,62 +401,220 @@ static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsi
   return true;
 }
 
+static inline sse_meta_f4 normalize3(const sse_meta_f4& x)
+{
+  register __m128 m = _mm_mul_ps(x.v, x.v);
+  __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
+  return sse_meta_f4(_mm_mul_ps(x.v, newtonraphson_rsqrt4(r)));
+}
 
+static inline sse_meta_f4 normalize3_approx(const sse_meta_f4& x)
+{
+  register __m128 m = _mm_mul_ps(x.v, x.v);
+  __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
+  return sse_meta_f4(_mm_mul_ps(x.v, _mm_rsqrt_ps(r)));
+}
 
-struct sse_meta_f16
+
+struct sse_meta_f12
 {
   typedef float meta_type;
+  sse_meta_f4 c[3];
 
-  sse_meta_f4 c[4];
-  
-  sse_meta_f16(float xx, float xy, float xz,
+  sse_meta_f12(float xx, float xy, float xz,
                float yx, float yy, float yz,
                float zx, float zy, float zz)
+  { setValue(xx, xy, xz, yz, yy, yz, zx, zy, zz); }
+
+  sse_meta_f12(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
+  { setColumn(x, y, z); }
+
+  sse_meta_f12(__m128 x, __m128 y, __m128 z)
+  { setColumn(x, y, z); }
+
+  inline void setValue(float xx, float xy, float xz,
+                       float yx, float yy, float yz,
+                       float zx, float zy, float zz)
   {
-    setValue(xx, xy, xz, yz, yy, yz, zx, zy, zz);
+    c[0].setValue(xx, yx, zx, 0);
+    c[1].setValue(xy, yy, zy, 0);
+    c[2].setValue(xz, yz, zz, 0);
   }
 
-  sse_meta_f16(float xx, float xy, float xz, float xw,
-               float yx, float yy, float yz, float yw,
-               float zx, float zy, float zz, float zw,
-               float wx, float wy, float wz, float ww)
+  inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
   {
-    setValue(xx, xy, xz, xw,
-             yz, yy, yz, yw,
-             zx, zy, zz, zw,
-             wx, wy, wz, ww);
+    c[0] = x; c[1] = y; c[2] = z;
   }
 
-  sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
+  inline void setColumn(__m128 x, __m128 y, __m128 z)
   {
-    setColumn(x, y, z);
+    c[0].setValue(x); c[1].setValue(y); c[2].setValue(z);
   }
 
-  sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
+  inline const sse_meta_f4& getColumn(size_t i) const
   {
-    setColumn(x, y, z, w);
+    return c[i];
   }
 
-  sse_meta_f16(__m128 x, __m128 y, __m128 z)
+  inline sse_meta_f4& getColumn(size_t i) 
   {
-    setColumn(x, y, z);
+    return c[i];
   }
 
-  sse_meta_f16(__m128 x, __m128 y, __m128 z, __m128 w)
+  inline sse_meta_f4 getRow(size_t i) const
   {
-    setColumn(x, y, z, w);
+    return sse_meta_f4(c[0][i], c[1][i], c[2][i], 0);
   }
 
-  inline void setValue(float xx, float xy, float xz,
-                       float yx, float yy, float yz,
-                       float zx, float zy, float zz)
+  inline float operator () (size_t i, size_t j) const
   {
-    c[0].setValue(xx, yx, zx, 0);
-    c[1].setValue(xy, yy, zy, 0);
-    c[2].setValue(xz, yz, zz, 0);
-    c[3].setValue(0, 0, 0, 1);
+    return c[j][i];
+  }
+
+  inline float& operator () (size_t i, size_t j) 
+  {
+    return c[j][i];
+  }
+
+  inline sse_meta_f4 operator * (const sse_meta_f4& v) const
+  {
+    return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))), _mm_mul_ps(c[2].v, vec_splat(v.v, 2))));
+  }
+
+  inline sse_meta_f12 operator * (const sse_meta_f12& mat) const
+  {
+    return sse_meta_f12((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]);
+  }
+
+  inline sse_meta_f12 operator + (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f12(c[0] + t, c[1] + t, c[2] + t);
+  }
+
+  inline sse_meta_f12 operator - (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f12(c[0] - t, c[1] - t, c[2] - t);
+  }
+
+  inline sse_meta_f12 operator * (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f12(c[0] * t, c[1] * t, c[2] * t);
+  }
+
+  inline sse_meta_f12 operator / (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f12(c[0] / t, c[1] / t, c[2] / t);
+  }
+
+
+  inline sse_meta_f12& operator += (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] += t;
+    c[1] += t;
+    c[2] += t;
+    return *this;
   }
 
+  inline sse_meta_f12& operator -= (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] -= t;
+    c[1] -= t;
+    c[2] -= t;
+    return *this;
+  }
+
+  inline sse_meta_f12& operator *= (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] *= t;
+    c[1] *= t;
+    c[2] *= t;
+    return *this;
+  }
+
+  inline sse_meta_f12& operator /= (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] /= t;
+    c[1] /= t;
+    c[2] /= t;
+    return *this;
+  }
+
+};
+
+
+static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128* r0, __m128* r1, __m128* r2)
+{
+  static const union { int i[4]; __m128 m; } selectmask __attribute__ ((aligned(16))) = {{0, 0xffffffff, 0, 0}};
+  register __m128 t0, t1;
+  t0 = _mm_unpackhi_ps(c0, c2);
+  t1 = _mm_unpacklo_ps(c0, c2);
+  *r0 = _mm_unpackhi_ps(t0, c1);
+  *r1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 3, 2, 2));
+  *r1 = vec_sel(*r1, c1, selectmask.m);
+  *r2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 1, 1, 0));
+  *r2 = vec_sel(*r2, vec_splat(c1, 1), selectmask.m);
+}
+
+static inline sse_meta_f12 transpose(const sse_meta_f12& mat)
+{
+  __m128 r0, r1, r2;
+  transpose(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &r0, &r1, &r2);
+  return sse_meta_f12(r0, r1, r2);
+}
+
+static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128* i0, __m128* i1, __m128* i2)
+{
+  __m128 t0, t1, t2, d, invd;
+  t2 = cross_prod(c0, c1);
+  t0 = cross_prod(c1, c2);
+  t1 = cross_prod(c2, c0);
+  d = dot_prod3(t2, c2);
+  d = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 0, 0, 0));
+  invd = _mm_rcp_ps(d); // approximate inverse
+  transpose(t0, t1, t2, i0, i1, i2);
+  *i0 = _mm_mul_ps(*i0, invd);
+  *i1 = _mm_mul_ps(*i1, invd);
+  *i2 = _mm_mul_ps(*i2, invd);
+}
+
+static inline sse_meta_f12 inverse(const sse_meta_f12& mat)
+{
+  __m128 inv0, inv1, inv2;
+  inverse(mat.getColumn(0).v, mat.getColumn(1).v, mat.getColumn(2).v, &inv0, &inv1, &inv2);
+  return sse_meta_f12(inv0, inv1, inv2);
+}
+
+static inline float determinant(const sse_meta_f12& mat)
+{
+  return _mm_cvtss_f32(dot_prod3(mat.getColumn(2).v, cross_prod(mat.getColumn(0).v, mat.getColumn(1).v)));
+}
+
+
+struct sse_meta_f16
+{
+  typedef float meta_type;
+  sse_meta_f4 c[4];
+  
+  sse_meta_f16(float xx, float xy, float xz, float xw,
+               float yx, float yy, float yz, float yw,
+               float zx, float zy, float zz, float zw,
+               float wx, float wy, float wz, float ww)
+  { setValue(xx, xy, xz, xw, yz, yy, yz, yw, zx, zy, zz, zw, wx, wy, wz, ww); }
+
+  sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
+  { setColumn(x, y, z, w); }
+
+  sse_meta_f16(__m128 x, __m128 y, __m128 z, __m128 w)
+  { setColumn(x, y, z, w); }
+
   inline void setValue(float xx, float xy, float xz, float xw,
                        float yx, float yy, float yz, float yw,
                        float zx, float zy, float zz, float zw,
@@ -388,21 +626,11 @@ struct sse_meta_f16
     c[3].setValue(xw, yw, zw, ww);
   }
 
-  inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
-  {
-    c[0] = x; c[1] = y; c[2] = z; c[3].setValue(0, 0, 0, 1);
-  }
-
   inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
   {
     c[0] = x; c[1] = y; c[2] = z; c[3] = w;
   }
 
-  inline void setColumn(__m128 x, __m128 y, __m128 z)
-  {
-    c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); c[3].setValue(0, 0, 0, 1);
-  }
-
   inline void setColumn(__m128 x, __m128 y, __m128 z, __m128 w)
   {
     c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); c[3].setValue(w);
@@ -433,12 +661,10 @@ struct sse_meta_f16
     return c[j][i];
   }
 
-  
-
   inline sse_meta_f4 operator * (const sse_meta_f4& v) const
   {
-    return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(0, 0, 0, 0))), _mm_mul_ps(c[1].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(1, 1, 1, 1)))), 
-                                  _mm_add_ps(_mm_mul_ps(c[2].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(2, 2, 2, 2))), _mm_mul_ps(c[3].v, _mm_shuffle_ps(v.v, v.v, _MM_SHUFFLE(3, 3, 3, 3))))
+    return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))), 
+                                  _mm_add_ps(_mm_mul_ps(c[2].v, vec_splat(v.v, 2)), _mm_mul_ps(c[3].v, vec_splat(v.v, 3)))
                                   ));
   }
 
@@ -447,7 +673,70 @@ struct sse_meta_f16
     return sse_meta_f16((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]);
   }
 
+  inline sse_meta_f16 operator + (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f16(c[0] + t, c[1] + t, c[2] + t, c[3] + t);
+  }
 
+  inline sse_meta_f16 operator - (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f16(c[0] - t, c[1] - t, c[2] - t, c[3] - t);
+  }
+
+  inline sse_meta_f16 operator * (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f16(c[0] * t, c[1] * t, c[2] * t, c[3] * t);
+  }
+
+  inline sse_meta_f16 operator / (float t_) const
+  {
+    sse_meta_f4 t(t_);
+    return sse_meta_f16(c[0] / t, c[1] / t, c[2] / t, c[3] / t);
+  }
+
+
+  inline sse_meta_f16& operator += (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] += t;
+    c[1] += t;
+    c[2] += t;
+    c[3] += t;
+    return *this;
+  }
+
+  inline sse_meta_f16& operator -= (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] -= t;
+    c[1] -= t;
+    c[2] -= t;
+    c[3] -= t;
+    return *this;
+  }
+
+  inline sse_meta_f16& operator *= (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] *= t;
+    c[1] *= t;
+    c[2] *= t;
+    c[3] *= t;
+    return *this;
+  }
+
+  inline sse_meta_f16& operator /= (float t_)
+  {
+    sse_meta_f4 t(t_);
+    c[0] /= t;
+    c[1] /= t;
+    c[2] /= t;
+    c[3] /= t;
+    return *this;
+  }
 
 
 };
@@ -460,6 +749,98 @@ sse_meta_f16 transpose(const sse_meta_f16& mat)
   __m128 tmp3 = _mm_unpacklo_ps(mat.getColumn(1).v, mat.getColumn(3).v);
   return sse_meta_f16(_mm_unpackhi_ps(tmp0, tmp1), _mm_unpacklo_ps(tmp0, tmp1), _mm_unpackhi_ps(tmp2, tmp3), _mm_unpacklo_ps(tmp2, tmp3));
 }
+
+
+sse_meta_f16 inverse(const sse_meta_f16& mat)
+{
+  __m128 Va, Vb, Vc;
+  __m128 r1, r2, r3, tt, tt2;
+  __m128 sum, Det, RDet;
+  __m128 trns0, trns1, trns2, trns3;
+
+  __m128 _L1 = mat.getColumn(0).v;
+  __m128 _L2 = mat.getColumn(1).v;
+  __m128 _L3 = mat.getColumn(2).v;
+  __m128 _L4 = mat.getColumn(3).v;
+  // Calculating the minterms for the first line.
+
+  tt = _L4; tt2 = _mm_ror_ps(_L3,1); 
+  Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0));					// V3'\B7V4
+  Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2));					// V3'\B7V4"
+  Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3));					// V3'\B7V4^
+
+  r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V3"\B7V4^ - V3^\B7V4"
+  r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V3^\B7V4' - V3'\B7V4^
+  r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V3'\B7V4" - V3"\B7V4'
+
+  tt = _L2;
+  Va = _mm_ror_ps(tt,1);		sum = _mm_mul_ps(Va,r1);
+  Vb = _mm_ror_ps(tt,2);		sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
+  Vc = _mm_ror_ps(tt,3);		sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
+
+  // Calculating the determinant.
+  Det = _mm_mul_ps(sum,_L1);
+  Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
+
+  static const union { int i[4]; __m128 m; } Sign_PNPN __attribute__ ((aligned(16))) = {{0x00000000, 0x80000000, 0x00000000, 0x80000000}};
+  static const union { int i[4]; __m128 m; } Sign_NPNP __attribute__ ((aligned(16))) = {{0x80000000, 0x00000000, 0x80000000, 0x00000000}};
+  static const union { float i[4]; __m128 m; } ZERONE __attribute__ ((aligned(16))) = {{1.0f, 0.0f, 0.0f, 1.0f}};
+
+  __m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN.m);
+
+  // Calculating the minterms of the second line (using previous results).
+  tt = _mm_ror_ps(_L1,1);		sum = _mm_mul_ps(tt,r1);
+  tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+  tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+  __m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP.m);
+
+  // Testing the determinant.
+  Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
+
+  // Calculating the minterms of the third line.
+  tt = _mm_ror_ps(_L1,1);
+  Va = _mm_mul_ps(tt,Vb);									// V1'\B7V2"
+  Vb = _mm_mul_ps(tt,Vc);									// V1'\B7V2^
+  Vc = _mm_mul_ps(tt,_L2);								// V1'\B7V2
+
+  r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2));		// V1"\B7V2^ - V1^\B7V2"
+  r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0));		// V1^\B7V2' - V1'\B7V2^
+  r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1));		// V1'\B7V2" - V1"\B7V2'
+
+  tt = _mm_ror_ps(_L4,1);		sum = _mm_mul_ps(tt,r1);
+  tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+  tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+  __m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN.m);
+
+  // Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs).
+  RDet = _mm_div_ss(ZERONE.m, Det); // TODO: just 1.0f?
+  RDet = _mm_shuffle_ps(RDet,RDet,0x00);
+
+  // Devide the first 12 minterms with the determinant.
+  mtL1 = _mm_mul_ps(mtL1, RDet);
+  mtL2 = _mm_mul_ps(mtL2, RDet);
+  mtL3 = _mm_mul_ps(mtL3, RDet);
+
+  // Calculate the minterms of the forth line and devide by the determinant.
+  tt = _mm_ror_ps(_L3,1);		sum = _mm_mul_ps(tt,r1);
+  tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
+  tt = _mm_ror_ps(tt,1);		sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
+  __m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP.m);
+  mtL4 = _mm_mul_ps(mtL4, RDet);
+
+  // Now we just have to transpose the minterms matrix.
+  trns0 = _mm_unpacklo_ps(mtL1,mtL2);
+  trns1 = _mm_unpacklo_ps(mtL3,mtL4);
+  trns2 = _mm_unpackhi_ps(mtL1,mtL2);
+  trns3 = _mm_unpackhi_ps(mtL3,mtL4);
+  _L1 = _mm_movelh_ps(trns0,trns1);
+  _L2 = _mm_movehl_ps(trns1,trns0);
+  _L3 = _mm_movelh_ps(trns2,trns3);
+  _L4 = _mm_movehl_ps(trns3,trns2);
+
+  return sse_meta_f16(_L1, _L2, _L3, _L4);
+}
+
                                 
 
 
diff --git a/trunk/fcl/include/fcl/vec_3f.h b/trunk/fcl/include/fcl/vec_3f.h
index 1781814c..7360e5f9 100644
--- a/trunk/fcl/include/fcl/vec_3f.h
+++ b/trunk/fcl/include/fcl/vec_3f.h
@@ -80,10 +80,10 @@ public:
   inline Vec3fX& operator /= (U t) { data /= t; return *this; }
   inline Vec3fX operator - () { return Vec3fX(-data); }
   inline Vec3fX cross(const Vec3fX& other) const { return Vec3fX(details::cross_prod(data, other.data)); }
-  inline U dot(const Vec3fX& other) const { return details::dot_prod(data, other.data); }
+  inline U dot(const Vec3fX& other) const { return details::dot_prod3(data, other.data); }
   inline bool normalize()
   {
-    U sqr_length = details::dot_prod(data, data);
+    U sqr_length = details::dot_prod3(data, data);
     if(sqr_length > 0)
     {
       *this /= (U)sqrt(sqr_length);
@@ -95,15 +95,15 @@ public:
 
   inline Vec3fX normalized() const
   {
-    U sqr_length = details::dot_prod(data, data);
+    U sqr_length = details::dot_prod3(data, data);
     if(sqr_length > 0)
       return *this / (U)sqrt(sqr_length);
     else
       return *this;
   }
 
-  inline U length() const { return sqrt(details::dot_prod(data, data)); }
-  inline U sqrLength() const { return details::dot_prod(data, data); }
+  inline U length() const { return sqrt(details::dot_prod3(data, data)); }
+  inline U sqrLength() const { return details::dot_prod3(data, data); }
   inline void setValue(U x, U y, U z) { data.setValue(x, y, z); }
   inline void setValue(U x) { data.setValue(x); }
   inline bool equal(const Vec3fX& other, U epsilon = std::numeric_limits<U>::epsilon() * 100) const { return details::equal(data, other.data, epsilon); }
-- 
GitLab