diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 88ae18961a525dfdac3b4727a41c883faf384d17..6dfbafdd8c5a78dc4a6945c549987e2f4cb0b5e9 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -39,7 +39,7 @@ add_fcl_test(test_fcl_box_box_distance test_fcl_box_box_distance.cpp test_fcl_ut
 add_fcl_test(test_fcl_simple test_fcl_simple.cpp)
 add_fcl_test(test_fcl_capsule_box_1 test_fcl_capsule_box_1.cpp test_fcl_utility.cpp)
 add_fcl_test(test_fcl_capsule_box_2 test_fcl_capsule_box_2.cpp test_fcl_utility.cpp)
-#add_fcl_test(test_fcl_obb test_fcl_obb.cpp)
+add_fcl_test(test_fcl_obb test_fcl_obb.cpp)
 
 add_fcl_test(test_fcl_bvh_models test_fcl_bvh_models.cpp test_fcl_utility.cpp)
 
diff --git a/test/test_fcl_obb.cpp b/test/test_fcl_obb.cpp
index 1f24441d3b34233d259d5cc69e4963bfa0565d94..b3ac2740a85fbf83b2e3b5eebda7c40e7974076b 100644
--- a/test/test_fcl_obb.cpp
+++ b/test/test_fcl_obb.cpp
@@ -33,166 +33,1338 @@
  *  POSSIBILITY OF SUCH DAMAGE.
  */
 
-#define BOOST_TEST_MODULE FCL_DISTANCE_OBB
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-#include <boost/utility/binary.hpp>
+#include <fstream>
+#include <sstream>
 
-#define CHECK_CLOSE_TO_0(x, eps) BOOST_CHECK_CLOSE ((x + 1.0), (1.0), (eps))
+#define BOOST_CHRONO_VERSION 2
+#include <boost/chrono/chrono.hpp>
+#include <boost/chrono/chrono_io.hpp>
 
-#include <boost/test/included/unit_test.hpp>
 #include <hpp/fcl/narrowphase/narrowphase.h>
 
 #include "../src/BV/OBB.h"
 #include "../src/distance_func_matrix.h"
 
-using hpp::fcl::FCL_REAL;
+using namespace hpp::fcl;
 
-struct Sample
+int getNumCPUs()
 {
-  hpp::fcl::Matrix3f R;
-  hpp::fcl::Vec3f T;
-  hpp::fcl::Vec3f extent1;
-  hpp::fcl::Vec3f extent2;
-};
+  int NumCPUs = 0;
+  int MaxID = -1;
+  std::ifstream f("/proc/cpuinfo");
+  if (!f.is_open()) {
+    std::cerr << "failed to open /proc/cpuinfo\n";
+    return -1;
+  }
+  const std::string Key = "processor";
+  std::string ln;
+  while (std::getline(f, ln)) {
+    if (ln.empty()) continue;
+    size_t SplitIdx = ln.find(':');
+    std::string value;
+    if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
+    if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
+      NumCPUs++;
+      if (!value.empty()) {
+        int CurID = (int) strtol(value.c_str(), NULL, 10);
+        MaxID = std::max(CurID, MaxID);
+      }
+    }
+  }
+  if (f.bad()) {
+    std::cerr << "Failure reading /proc/cpuinfo\n";
+    return -1;
+  }
+  if (!f.eof()) {
+    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
+    return -1;
+  }
+  f.close();
+
+  if ((MaxID + 1) != NumCPUs) {
+    fprintf(stderr,
+            "CPU ID assignments in /proc/cpuinfo seem messed up."
+            " This is usually caused by a bad BIOS.\n");
+  }
+  return NumCPUs;
+}
+
+bool checkCpuScalingEnabled()
+{
+  int num_cpus = getNumCPUs ();
+
+  // We don't have a valid CPU count, so don't even bother.
+  if (num_cpus <= 0) return false;
+  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
+  // local file system. If reading the exported files fails, then we may not be
+  // running on Linux, so we silently ignore all the read errors.
+  std::string res;
+  for (int cpu = 0; cpu < num_cpus; ++cpu) {
+    std::ostringstream oss;
+    oss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor";
+    std::string governor_file = oss.str();
+    std::ifstream f(governor_file.c_str());
+    if (!f.is_open()) return false;
+    f >> res;
+    if (!f.good()) return false;
+    if (res != "performance") return true;
+  }
+  return false;
+}
+
+void randomOBBs (
+    Vec3f& a, Vec3f& b, FCL_REAL extentNorm)
+{
+  // Extent norm is between 0 and extentNorm on each axis
+  //a = (Vec3f::Ones()+Vec3f::Random()) * extentNorm / (2*sqrt(3));
+  //b = (Vec3f::Ones()+Vec3f::Random()) * extentNorm / (2*sqrt(3));
+
+  a = extentNorm * Vec3f::Random().cwiseAbs().normalized();
+  b = extentNorm * Vec3f::Random().cwiseAbs().normalized();
+}
+
+void randomTransform (
+    Matrix3f& B, Vec3f& T,
+    const Vec3f& a, const Vec3f& b, const FCL_REAL extentNorm)
+{
+  // TODO Should we scale T to a and b norm ?
+  (void) a;
+  (void) b;
+  (void) extentNorm;
+
+  FCL_REAL N = a.norm() + b.norm();
+  // A translation of norm N ensures there is no collision.
+  // Set translation to be between 0 and 2 * N;
+  T = ( Vec3f::Random() / sqrt(3) ) * 1.5 * N;
+  //T.setZero();
+
+  Quaternion3f q;
+  q.coeffs().setRandom();
+  q.normalize();
+  B = q;
+}
+
+#define NB_METHODS 7
+#define MANUAL_PRODUCT 1
+
+#if MANUAL_PRODUCT
+# define PRODUCT(M33,v3) ( M33.col(0) * v3[0] + M33.col(1) * v3[1] + M33.col(2) * v3[2] )
+#else
+# define PRODUCT(M33,v3) (M33*v3)
+#endif
+
+typedef boost::chrono::high_resolution_clock clock_type;
+typedef clock_type::duration duration_type;
+
+const char* sep = ",\t"; 
+const FCL_REAL eps = 1e-10;
+
+const Eigen::IOFormat py_fmt(Eigen::FullPrecision,
+    0,
+    ", ",       // Coeff separator
+    ",\n",      // Row separator
+    "(",        // row prefix
+    ",)",       // row suffix
+    "( ",       // mat prefix
+    ", )"       // mat suffix
+    );
+
+namespace obbDisjoint_impls
+{
+  /// \return true if OBB are disjoint.
+  bool distance (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, FCL_REAL& distance)
+  {
+    GJKSolver_indep gjk;
+    Box ba (2*a), bb (2*b);
+    Transform3f tfa, tfb (B, T);
+
+    Vec3f p1, p2, normal;
+    return gjk.shapeDistance (ba, tfa, bb, tfb, distance, p1, p2, normal);
+  }
+
+  inline FCL_REAL _computeDistanceForCase1 (
+      const Vec3f& T, const Vec3f& a, const Vec3f& b, const Matrix3f& Bf)
+  {
+    Vec3f AABB_corner;
+     /* This seems to be slower
+    AABB_corner.noalias() = T.cwiseAbs () - a;
+    AABB_corner.noalias() -= PRODUCT(Bf,b);
+    /*/
+#if MANUAL_PRODUCT
+    AABB_corner.noalias() = T.cwiseAbs () - a;
+    AABB_corner.noalias() -= Bf.col(0) * b[0];
+    AABB_corner.noalias() -= Bf.col(1) * b[1];
+    AABB_corner.noalias() -= Bf.col(2) * b[2];
+#else
+    AABB_corner.noalias() = T.cwiseAbs () - Bf * b - a;
+#endif
+    // */
+    return AABB_corner.array().max(0).matrix().squaredNorm ();
+  }
+
+  inline FCL_REAL _computeDistanceForCase2 (
+      const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const Matrix3f& Bf)
+  {
+    /*
+    Vec3f AABB_corner(PRODUCT(B.transpose(), T).cwiseAbs() - b);
+    AABB_corner.noalias() -= PRODUCT(Bf.transpose(), a);
+    return AABB_corner.array().max(0).matrix().squaredNorm ();
+    /*/
+#if MANUAL_PRODUCT
+    register FCL_REAL s, t = 0;
+    s = std::abs(B.col(0).dot(T)) - Bf.col(0).dot(a) - b[0];
+    if (s > 0) t += s*s;
+    s = std::abs(B.col(1).dot(T)) - Bf.col(1).dot(a) - b[1];
+    if (s > 0) t += s*s;
+    s = std::abs(B.col(2).dot(T)) - Bf.col(2).dot(a) - b[2];
+    if (s > 0) t += s*s;
+    return t;
+#else
+    Vec3f AABB_corner((B.transpose () * T).cwiseAbs ()  - Bf.transpose () * a - b);
+    return AABB_corner.array().max(0).matrix().squaredNorm ();
+#endif
+    // */
+  }
+
+  int separatingAxisId (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+  {
+    int id = 0;
+
+    Matrix3f Bf (B.cwiseAbs());
+
+    // Corner of b axis aligned bounding box the closest to the origin
+    squaredLowerBoundDistance = _computeDistanceForCase1 (T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return id;
+    ++id;
+
+    // | B^T T| - b - Bf^T a
+    squaredLowerBoundDistance = _computeDistanceForCase2 (B, T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return id;
+    ++id;
+
+    int ja = 1, ka = 2, jb = 1, kb = 2;
+    for (int ia = 0; ia < 3; ++ia) {
+      for (int ib = 0; ib < 3; ++ib) {
+        const FCL_REAL s = T[ka] * B(ja, ib) - T[ja] * B(ka, ib);
+
+        const FCL_REAL diff = fabs(s) - (a[ja] * Bf(ka, ib) + a[ka] * Bf(ja, ib) +
+            b[jb] * Bf(ia, kb) + b[kb] * Bf(ia, jb));
+        // We need to divide by the norm || Aia x Bib ||
+        // As ||Aia|| = ||Bib|| = 1, (Aia | Bib)^2  = cosine^2
+        if (diff > 0) {
+          FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+          if (sinus2 > 1e-6) {
+            squaredLowerBoundDistance = diff * diff / sinus2;
+            if (squaredLowerBoundDistance > breakDistance2) {
+              return id;
+            }
+          }
+          /* // or
+             FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+             squaredLowerBoundDistance = diff * diff;
+             if (squaredLowerBoundDistance > breakDistance2 * sinus2) {
+             squaredLowerBoundDistance /= sinus2;
+             return true;
+             }
+          // */
+        }
+        ++id;
+
+        jb = kb; kb = ib;
+      }
+      ja = ka; ka = ia;
+    }
+
+    return id;
+  }
+
+  // ------------------------ 0 --------------------------------------
+  bool withRuntimeLoop (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+  {
+    Matrix3f Bf (B.cwiseAbs());
+
+    // Corner of b axis aligned bounding box the closest to the origin
+    squaredLowerBoundDistance = _computeDistanceForCase1 (T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    // | B^T T| - b - Bf^T a
+    squaredLowerBoundDistance = _computeDistanceForCase2 (B, T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    int ja = 1, ka = 2, jb = 1, kb = 2;
+    for (int ia = 0; ia < 3; ++ia) {
+      for (int ib = 0; ib < 3; ++ib) {
+        const FCL_REAL s = T[ka] * B(ja, ib) - T[ja] * B(ka, ib);
+
+        const FCL_REAL diff = fabs(s) - (a[ja] * Bf(ka, ib) + a[ka] * Bf(ja, ib) +
+            b[jb] * Bf(ia, kb) + b[kb] * Bf(ia, jb));
+        // We need to divide by the norm || Aia x Bib ||
+        // As ||Aia|| = ||Bib|| = 1, (Aia | Bib)^2  = cosine^2
+        if (diff > 0) {
+          FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+          if (sinus2 > 1e-6) {
+            squaredLowerBoundDistance = diff * diff / sinus2;
+            if (squaredLowerBoundDistance > breakDistance2) {
+              return true;
+            }
+          }
+          /* // or
+             FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+             squaredLowerBoundDistance = diff * diff;
+             if (squaredLowerBoundDistance > breakDistance2 * sinus2) {
+             squaredLowerBoundDistance /= sinus2;
+             return true;
+             }
+          // */
+        }
+
+        jb = kb; kb = ib;
+      }
+      ja = ka; ka = ia;
+    }
+
+    return false;
+
+  }
+
+  // ------------------------ 1 --------------------------------------
+  bool withManualLoopUnrolling_1 (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+  {
+    FCL_REAL t, s;
+    FCL_REAL diff;
+
+    // Matrix3f Bf = abs(B);
+    // Bf += reps;
+    Matrix3f Bf (B.cwiseAbs());
+
+    // Corner of b axis aligned bounding box the closest to the origin
+    Vec3f AABB_corner (T.cwiseAbs () - Bf * b);
+    Vec3f diff3 (AABB_corner - a);
+    diff3 = diff3.cwiseMax (Vec3f::Zero());
+
+    //for (Vec3f::Index i=0; i<3; ++i) diff3 [i] = std::max (0, diff3 [i]);
+    squaredLowerBoundDistance = diff3.squaredNorm ();
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    AABB_corner = (B.transpose () * T).cwiseAbs ()  - Bf.transpose () * a;
+    // diff3 = | B^T T| - b - Bf^T a
+    diff3 = AABB_corner - b;
+    diff3 = diff3.cwiseMax (Vec3f::Zero());
+    squaredLowerBoundDistance = diff3.squaredNorm ();
+
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    // A0 x B0
+    s = T[2] * B(1, 0) - T[1] * B(2, 0);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    FCL_REAL sinus2;
+    diff = t - (a[1] * Bf(2, 0) + a[2] * Bf(1, 0) +
+        b[1] * Bf(0, 2) + b[2] * Bf(0, 1));
+    // We need to divide by the norm || A0 x B0 ||
+    // As ||A0|| = ||B0|| = 1,
+    //              2            2
+    // || A0 x B0 ||  + (A0 | B0)  = 1
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,0) * Bf (0,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A0 x B1
+    s = T[2] * B(1, 1) - T[1] * B(2, 1);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[1] * Bf(2, 1) + a[2] * Bf(1, 1) +
+        b[0] * Bf(0, 2) + b[2] * Bf(0, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,1) * Bf (0,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A0 x B2
+    s = T[2] * B(1, 2) - T[1] * B(2, 2);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[1] * Bf(2, 2) + a[2] * Bf(1, 2) +
+        b[0] * Bf(0, 1) + b[1] * Bf(0, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,2) * Bf (0,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B0
+    s = T[0] * B(2, 0) - T[2] * B(0, 0);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(2, 0) + a[2] * Bf(0, 0) +
+        b[1] * Bf(1, 2) + b[2] * Bf(1, 1));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,0) * Bf (1,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B1
+    s = T[0] * B(2, 1) - T[2] * B(0, 1);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(2, 1) + a[2] * Bf(0, 1) +
+        b[0] * Bf(1, 2) + b[2] * Bf(1, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,1) * Bf (1,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B2
+    s = T[0] * B(2, 2) - T[2] * B(0, 2);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(2, 2) + a[2] * Bf(0, 2) +
+        b[0] * Bf(1, 1) + b[1] * Bf(1, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,2) * Bf (1,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B0
+    s = T[1] * B(0, 0) - T[0] * B(1, 0);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(1, 0) + a[1] * Bf(0, 0) +
+        b[1] * Bf(2, 2) + b[2] * Bf(2, 1));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,0) * Bf (2,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B1
+    s = T[1] * B(0, 1) - T[0] * B(1, 1);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(1, 1) + a[1] * Bf(0, 1) +
+        b[0] * Bf(2, 2) + b[2] * Bf(2, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,1) * Bf (2,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B2
+    s = T[1] * B(0, 2) - T[0] * B(1, 2);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(1, 2) + a[1] * Bf(0, 2) +
+        b[0] * Bf(2, 1) + b[1] * Bf(2, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,2) * Bf (2,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+
+  }
+
+  // ------------------------ 2 --------------------------------------
+  bool withManualLoopUnrolling_2 (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+  {
+    Matrix3f Bf (B.cwiseAbs());
+
+    // Corner of b axis aligned bounding box the closest to the origin
+    squaredLowerBoundDistance = _computeDistanceForCase1 (T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    squaredLowerBoundDistance = _computeDistanceForCase2 (B, T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    // A0 x B0
+    FCL_REAL t, s;
+    s = T[2] * B(1, 0) - T[1] * B(2, 0);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    FCL_REAL sinus2;
+    FCL_REAL diff;
+    diff = t - (a[1] * Bf(2, 0) + a[2] * Bf(1, 0) +
+        b[1] * Bf(0, 2) + b[2] * Bf(0, 1));
+    // We need to divide by the norm || A0 x B0 ||
+    // As ||A0|| = ||B0|| = 1,
+    //              2            2
+    // || A0 x B0 ||  + (A0 | B0)  = 1
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,0) * Bf (0,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A0 x B1
+    s = T[2] * B(1, 1) - T[1] * B(2, 1);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[1] * Bf(2, 1) + a[2] * Bf(1, 1) +
+        b[0] * Bf(0, 2) + b[2] * Bf(0, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,1) * Bf (0,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A0 x B2
+    s = T[2] * B(1, 2) - T[1] * B(2, 2);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[1] * Bf(2, 2) + a[2] * Bf(1, 2) +
+        b[0] * Bf(0, 1) + b[1] * Bf(0, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,2) * Bf (0,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B0
+    s = T[0] * B(2, 0) - T[2] * B(0, 0);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(2, 0) + a[2] * Bf(0, 0) +
+        b[1] * Bf(1, 2) + b[2] * Bf(1, 1));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,0) * Bf (1,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B1
+    s = T[0] * B(2, 1) - T[2] * B(0, 1);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(2, 1) + a[2] * Bf(0, 1) +
+        b[0] * Bf(1, 2) + b[2] * Bf(1, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,1) * Bf (1,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B2
+    s = T[0] * B(2, 2) - T[2] * B(0, 2);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(2, 2) + a[2] * Bf(0, 2) +
+        b[0] * Bf(1, 1) + b[1] * Bf(1, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,2) * Bf (1,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B0
+    s = T[1] * B(0, 0) - T[0] * B(1, 0);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(1, 0) + a[1] * Bf(0, 0) +
+        b[1] * Bf(2, 2) + b[2] * Bf(2, 1));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,0) * Bf (2,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B1
+    s = T[1] * B(0, 1) - T[0] * B(1, 1);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(1, 1) + a[1] * Bf(0, 1) +
+        b[0] * Bf(2, 2) + b[2] * Bf(2, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,1) * Bf (2,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B2
+    s = T[1] * B(0, 2) - T[0] * B(1, 2);
+    t = ((s < 0.0) ? -s : s); assert (t == fabs (s));
+
+    diff = t - (a[0] * Bf(1, 2) + a[1] * Bf(0, 2) +
+        b[0] * Bf(2, 1) + b[1] * Bf(2, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,2) * Bf (2,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+
+  }
+
+  // ------------------------ 3 --------------------------------------
+  template <int ia, int ib,
+    int ja = (ia+1)%3, int ka = (ia+2)%3,
+    int jb = (ib+1)%3, int kb = (ib+2)%3 >
+  struct loop_case_1 {
+    static inline bool run (
+        const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b,
+        const Matrix3f& Bf,
+        const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+    {
+      const FCL_REAL s = T[ka] * B(ja, ib) - T[ja] * B(ka, ib);
+
+      const FCL_REAL diff = fabs(s) - (a[ja] * Bf(ka, ib) + a[ka] * Bf(ja, ib) +
+                                       b[jb] * Bf(ia, kb) + b[kb] * Bf(ia, jb));
+      // We need to divide by the norm || Aia x Bib ||
+      // As ||Aia|| = ||Bib|| = 1, (Aia | Bib)^2  = cosine^2
+      if (diff > 0) {
+        FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+        if (sinus2 > 1e-6) {
+          squaredLowerBoundDistance = diff * diff / sinus2;
+          if (squaredLowerBoundDistance > breakDistance2) {
+            return true;
+          }
+        }
+        /* // or
+           FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+           squaredLowerBoundDistance = diff * diff;
+           if (squaredLowerBoundDistance > breakDistance2 * sinus2) {
+           squaredLowerBoundDistance /= sinus2;
+           return true;
+           }
+        // */
+      }
+      return false;
+    }
+  };
+
+  bool withTemplateLoopUnrolling_1 (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+  {
+    Matrix3f Bf (B.cwiseAbs());
+
+    // Corner of b axis aligned bounding box the closest to the origin
+    squaredLowerBoundDistance = _computeDistanceForCase1 (T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    squaredLowerBoundDistance = _computeDistanceForCase2 (B, T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    // Ai x Bj
+    if (loop_case_1<0,0>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<0,1>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<0,2>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<1,0>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<1,1>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<1,2>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<2,0>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<2,1>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+    if (loop_case_1<2,2>::run (B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+
+    return false;
+  }
 
-struct Result
+  // ------------------------ 4 --------------------------------------
+
+  template <int ib, int jb = (ib+1)%3, int kb = (ib+2)%3 >
+  struct loop_case_2 
+  {
+    static inline bool run (int ia, int ja, int ka,
+        const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b,
+        const Matrix3f& Bf,
+        const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+    {
+      const FCL_REAL s = T[ka] * B(ja, ib) - T[ja] * B(ka, ib);
+
+      const FCL_REAL diff = fabs(s) - (a[ja] * Bf(ka, ib) + a[ka] * Bf(ja, ib) +
+                                       b[jb] * Bf(ia, kb) + b[kb] * Bf(ia, jb));
+      // We need to divide by the norm || Aia x Bib ||
+      // As ||Aia|| = ||Bib|| = 1, (Aia | Bib)^2  = cosine^2
+      if (diff > 0) {
+        FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+        if (sinus2 > 1e-6) {
+          squaredLowerBoundDistance = diff * diff / sinus2;
+          if (squaredLowerBoundDistance > breakDistance2) {
+            return true;
+          }
+        }
+        /* // or
+           FCL_REAL sinus2 = 1 - Bf (ia,ib) * Bf (ia,ib);
+           squaredLowerBoundDistance = diff * diff;
+           if (squaredLowerBoundDistance > breakDistance2 * sinus2) {
+           squaredLowerBoundDistance /= sinus2;
+           return true;
+           }
+        // */
+      }
+      return false;
+    }
+  };
+
+  bool withPartialTemplateLoopUnrolling_1 (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+  {
+    Matrix3f Bf (B.cwiseAbs());
+
+    // Corner of b axis aligned bounding box the closest to the origin
+    squaredLowerBoundDistance = _computeDistanceForCase1 (T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    squaredLowerBoundDistance = _computeDistanceForCase2 (B, T, a, b, Bf);
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    // Ai x Bj
+    int ja = 1, ka = 2;
+    for (int ia = 0; ia < 3; ++ia) {
+      if (loop_case_2<0>::run (ia, ja, ka,
+            B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+      if (loop_case_2<1>::run (ia, ja, ka,
+            B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+      if (loop_case_2<2>::run (ia, ja, ka,
+            B, T, a, b, Bf, breakDistance2, squaredLowerBoundDistance)) return true;
+      ja = ka; ka = ia;
+    }
+
+    return false;
+  }
+
+  // ------------------------ 5 --------------------------------------
+  bool originalWithLowerBound (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& breakDistance2, FCL_REAL& squaredLowerBoundDistance)
+  {
+    register FCL_REAL t, s;
+    FCL_REAL diff;
+
+    Matrix3f Bf (B.cwiseAbs());
+    squaredLowerBoundDistance = 0;
+
+    // if any of these tests are one-sided, then the polyhedra are disjoint
+
+    // A1 x A2 = A0
+    t = ((T[0] < 0.0) ? -T[0] : T[0]);
+
+    diff = t - (a[0] + Bf.row(0).dot(b));
+    if (diff > 0) {
+      squaredLowerBoundDistance = diff*diff;
+    }
+
+    // A2 x A0 = A1
+    t = ((T[1] < 0.0) ? -T[1] : T[1]);
+
+    diff = t - (a[1] + Bf.row(1).dot(b));
+    if (diff > 0) {
+      squaredLowerBoundDistance += diff*diff;
+    }
+
+    // A0 x A1 = A2
+    t =((T[2] < 0.0) ? -T[2] : T[2]);
+
+    diff = t - (a[2] + Bf.row(2).dot(b));
+    if (diff > 0) {
+      squaredLowerBoundDistance += diff*diff;
+    }
+
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    squaredLowerBoundDistance = 0;
+
+    // B1 x B2 = B0
+    s =  B.col(0).dot(T);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (b[0] + Bf.col(0).dot(a));
+    if (diff > 0) {
+      squaredLowerBoundDistance += diff*diff;
+    }
+
+    // B2 x B0 = B1
+    s = B.col(1).dot(T);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (b[1] + Bf.col(1).dot(a));
+    if (diff > 0) {
+      squaredLowerBoundDistance += diff*diff;
+    }
+
+    // B0 x B1 = B2
+    s = B.col(2).dot(T);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (b[2] + Bf.col(2).dot(a));
+    if (diff > 0) {
+      squaredLowerBoundDistance += diff*diff;
+    }
+
+    if (squaredLowerBoundDistance > breakDistance2)
+      return true;
+
+    // A0 x B0
+    s = T[2] * B(1, 0) - T[1] * B(2, 0);
+    t = ((s < 0.0) ? -s : s);
+
+    FCL_REAL sinus2;
+    diff = t - (a[1] * Bf(2, 0) + a[2] * Bf(1, 0) +
+        b[1] * Bf(0, 2) + b[2] * Bf(0, 1));
+    // We need to divide by the norm || A0 x B0 ||
+    // As ||A0|| = ||B0|| = 1,
+    //              2            2
+    // || A0 x B0 ||  + (A0 | B0)  = 1
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,0) * Bf (0,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A0 x B1
+    s = T[2] * B(1, 1) - T[1] * B(2, 1);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[1] * Bf(2, 1) + a[2] * Bf(1, 1) +
+        b[0] * Bf(0, 2) + b[2] * Bf(0, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,1) * Bf (0,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A0 x B2
+    s = T[2] * B(1, 2) - T[1] * B(2, 2);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[1] * Bf(2, 2) + a[2] * Bf(1, 2) +
+        b[0] * Bf(0, 1) + b[1] * Bf(0, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (0,2) * Bf (0,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B0
+    s = T[0] * B(2, 0) - T[2] * B(0, 0);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[0] * Bf(2, 0) + a[2] * Bf(0, 0) +
+        b[1] * Bf(1, 2) + b[2] * Bf(1, 1));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,0) * Bf (1,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B1
+    s = T[0] * B(2, 1) - T[2] * B(0, 1);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[0] * Bf(2, 1) + a[2] * Bf(0, 1) +
+        b[0] * Bf(1, 2) + b[2] * Bf(1, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,1) * Bf (1,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A1 x B2
+    s = T[0] * B(2, 2) - T[2] * B(0, 2);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[0] * Bf(2, 2) + a[2] * Bf(0, 2) +
+        b[0] * Bf(1, 1) + b[1] * Bf(1, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (1,2) * Bf (1,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B0
+    s = T[1] * B(0, 0) - T[0] * B(1, 0);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[0] * Bf(1, 0) + a[1] * Bf(0, 0) +
+        b[1] * Bf(2, 2) + b[2] * Bf(2, 1));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,0) * Bf (2,0);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B1
+    s = T[1] * B(0, 1) - T[0] * B(1, 1);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[0] * Bf(1, 1) + a[1] * Bf(0, 1) +
+        b[0] * Bf(2, 2) + b[2] * Bf(2, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,1) * Bf (2,1);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    // A2 x B2
+    s = T[1] * B(0, 2) - T[0] * B(1, 2);
+    t = ((s < 0.0) ? -s : s);
+
+    diff = t - (a[0] * Bf(1, 2) + a[1] * Bf(0, 2) +
+        b[0] * Bf(2, 1) + b[1] * Bf(2, 0));
+    if (diff > 0) {
+      sinus2 = 1 - Bf (2,2) * Bf (2,2);
+      if (sinus2 > 1e-6) {
+        squaredLowerBoundDistance = diff * diff / sinus2;
+        if (squaredLowerBoundDistance > breakDistance2) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+
+  }
+
+  // ------------------------ 6 --------------------------------------
+  bool originalWithNoLowerBound (const Matrix3f& B, const Vec3f& T, const Vec3f& a, const Vec3f& b, const FCL_REAL& , FCL_REAL& squaredLowerBoundDistance)
+  {
+    register FCL_REAL t, s;
+    const FCL_REAL reps = 1e-6;
+
+    squaredLowerBoundDistance = 0;
+
+    Matrix3f Bf (B.array().abs() + reps);
+    // Bf += reps;
+
+    // if any of these tests are one-sided, then the polyhedra are disjoint
+
+    // A1 x A2 = A0
+    t = ((T[0] < 0.0) ? -T[0] : T[0]);
+
+    // if(t > (a[0] + Bf.dotX(b)))
+    if(t > (a[0] + Bf.row(0).dot(b)))
+      return true;
+
+    // B1 x B2 = B0
+    // s =  B.transposeDotX(T);
+    s =  B.col(0).dot(T);
+    t = ((s < 0.0) ? -s : s);
+
+    // if(t > (b[0] + Bf.transposeDotX(a)))
+    if(t > (b[0] + Bf.col(0).dot(a)))
+      return true;
+
+    // A2 x A0 = A1
+    t = ((T[1] < 0.0) ? -T[1] : T[1]);
+
+    // if(t > (a[1] + Bf.dotY(b)))
+    if(t > (a[1] + Bf.row(1).dot(b)))
+      return true;
+
+    // A0 x A1 = A2
+    t =((T[2] < 0.0) ? -T[2] : T[2]);
+
+    // if(t > (a[2] + Bf.dotZ(b)))
+    if(t > (a[2] + Bf.row(2).dot(b)))
+      return true;
+
+    // B2 x B0 = B1
+    // s = B.transposeDotY(T);
+    s = B.col(1).dot(T);
+    t = ((s < 0.0) ? -s : s);
+
+    // if(t > (b[1] + Bf.transposeDotY(a)))
+    if(t > (b[1] + Bf.col(1).dot(a)))
+      return true;
+
+    // B0 x B1 = B2
+    // s = B.transposeDotZ(T);
+    s = B.col(2).dot(T);
+    t = ((s < 0.0) ? -s : s);
+
+    // if(t > (b[2] + Bf.transposeDotZ(a)))
+    if(t > (b[2] + Bf.col(2).dot(a)))
+      return true;
+
+    // A0 x B0
+    s = T[2] * B(1, 0) - T[1] * B(2, 0);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[1] * Bf(2, 0) + a[2] * Bf(1, 0) +
+          b[1] * Bf(0, 2) + b[2] * Bf(0, 1)))
+      return true;
+
+    // A0 x B1
+    s = T[2] * B(1, 1) - T[1] * B(2, 1);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[1] * Bf(2, 1) + a[2] * Bf(1, 1) +
+          b[0] * Bf(0, 2) + b[2] * Bf(0, 0)))
+      return true;
+
+    // A0 x B2
+    s = T[2] * B(1, 2) - T[1] * B(2, 2);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[1] * Bf(2, 2) + a[2] * Bf(1, 2) +
+          b[0] * Bf(0, 1) + b[1] * Bf(0, 0)))
+      return true;
+
+    // A1 x B0
+    s = T[0] * B(2, 0) - T[2] * B(0, 0);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[0] * Bf(2, 0) + a[2] * Bf(0, 0) +
+          b[1] * Bf(1, 2) + b[2] * Bf(1, 1)))
+      return true;
+
+    // A1 x B1
+    s = T[0] * B(2, 1) - T[2] * B(0, 1);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[0] * Bf(2, 1) + a[2] * Bf(0, 1) +
+          b[0] * Bf(1, 2) + b[2] * Bf(1, 0)))
+      return true;
+
+    // A1 x B2
+    s = T[0] * B(2, 2) - T[2] * B(0, 2);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[0] * Bf(2, 2) + a[2] * Bf(0, 2) +
+          b[0] * Bf(1, 1) + b[1] * Bf(1, 0)))
+      return true;
+
+    // A2 x B0
+    s = T[1] * B(0, 0) - T[0] * B(1, 0);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[0] * Bf(1, 0) + a[1] * Bf(0, 0) +
+          b[1] * Bf(2, 2) + b[2] * Bf(2, 1)))
+      return true;
+
+    // A2 x B1
+    s = T[1] * B(0, 1) - T[0] * B(1, 1);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[0] * Bf(1, 1) + a[1] * Bf(0, 1) +
+          b[0] * Bf(2, 2) + b[2] * Bf(2, 0)))
+      return true;
+
+    // A2 x B2
+    s = T[1] * B(0, 2) - T[0] * B(1, 2);
+    t = ((s < 0.0) ? -s : s);
+
+    if(t > (a[0] * Bf(1, 2) + a[1] * Bf(0, 2) +
+          b[0] * Bf(2, 1) + b[1] * Bf(2, 0)))
+      return true;
+
+    return false;
+
+  }
+}
+
+struct BenchmarkResult
 {
-  bool overlap;
+  /// The test ID:
+  /// - 0-10 identifies a separating axes.
+  /// - 11 means no separatins axes could be found. (distance should be <= 0)
+  int ifId;
   FCL_REAL distance;
+  FCL_REAL squaredLowerBoundDistance;
+  duration_type duration[NB_METHODS];
+  bool failure;
+
+  static std::ostream& headers (std::ostream& os)
+  {
+    duration_type dummy (1);
+    std::string unit = " (" + boost::chrono::duration_units_default<>().get_unit (boost::chrono::duration_style::symbol, dummy) + ")";
+    os << boost::chrono::symbol_format
+      << "separating axis"
+      << sep << "distance lower bound"
+      << sep << "distance"
+      << sep << "failure"
+      << sep << "Runtime Loop" << unit
+      << sep << "Manual Loop Unrolling 1" << unit
+      << sep << "Manual Loop Unrolling 2" << unit
+      << sep << "Template Unrolling" << unit
+      << sep << "Partial Template Unrolling" << unit
+      << sep << "Original (LowerBound)" << unit
+      << sep << "Original (NoLowerBound)" << unit
+      ;
+    return os;
+  }
+
+  std::ostream& print (std::ostream& os) const
+  {
+    os << ifId
+      << sep << std::sqrt(squaredLowerBoundDistance)
+      << sep << distance
+      << sep << failure;
+    for (int i = 0; i < NB_METHODS; ++i) os << sep << duration[i].count();
+    return os;
+  }
 };
 
-BOOST_AUTO_TEST_CASE(obb_overlap)
+std::ostream& operator<< (std::ostream& os, const BenchmarkResult& br)
 {
-  static const size_t nbSamples = 10000;
-  Sample sample [nbSamples];
-  FCL_REAL range = 2;
-  FCL_REAL sumDistance = 0, sumDistanceLowerBound = 0;
-  for (std::size_t i=0; i<nbSamples; ++i) {
-    // sample unit quaternion
-    FCL_REAL u1 = (FCL_REAL) rand() / RAND_MAX;
-    FCL_REAL u2 = (FCL_REAL) rand() / RAND_MAX;
-    FCL_REAL u3 = (FCL_REAL) rand() / RAND_MAX;
-
-    FCL_REAL q1 = sqrt (1-u1)*sin(2*M_PI*u2);
-    FCL_REAL q2 = sqrt (1-u1)*cos(2*M_PI*u2);
-    FCL_REAL q3 = sqrt (u1) * sin(2*M_PI*u3);
-    FCL_REAL q4 = sqrt (u1) * cos(2*M_PI*u3);
-    hpp::fcl::Quaternion3f (q1, q2, q3, q4).toRotation (sample [i].R);
-
-    // sample translation
-    sample [i].T = hpp::fcl::Vec3f ((FCL_REAL) range * rand() / RAND_MAX,
-			       (FCL_REAL) range * rand() / RAND_MAX,
-			       (FCL_REAL) range * rand() / RAND_MAX);
-
-    // sample extents
-    sample [i].extent1 = hpp::fcl::Vec3f ((FCL_REAL) rand() / RAND_MAX,
-				     (FCL_REAL) rand() / RAND_MAX,
-				     (FCL_REAL) rand() / RAND_MAX);
-
-    sample [i].extent2 = hpp::fcl::Vec3f ((FCL_REAL) rand() / RAND_MAX,
-				     (FCL_REAL) rand() / RAND_MAX,
-				     (FCL_REAL) rand() / RAND_MAX);
-  }
-
-  // Compute result for each function
-  Result resultDistance [nbSamples];
-  Result resultDisjoint [nbSamples];
-  Result resultDisjointAndLowerBound [nbSamples];
-
-  hpp::fcl::Transform3f tf1;
-  hpp::fcl::Transform3f tf2;
-  hpp::fcl::Box box1 (0, 0, 0);
-  hpp::fcl::Box box2 (0, 0, 0);
-  hpp::fcl::DistanceRequest request (false, 0, 0, hpp::fcl::GST_INDEP);
-  hpp::fcl::DistanceResult result;
-  FCL_REAL distance;
-  FCL_REAL squaredDistance;
-  timeval t0, t1;
-  hpp::fcl::GJKSolver_indep gjkSolver;
-
-  // ShapeShapeDistance
-  gettimeofday (&t0, NULL);
-  for (std::size_t i=0; i<nbSamples; ++i) {
-    box1.halfSide = sample [i].extent1;
-    box2.halfSide = sample [i].extent2;
-    tf2.setTransform (sample [i].R, sample [i].T);
-    resultDistance [i].distance =
-      hpp::fcl::ShapeShapeDistance<hpp::fcl::Box, hpp::fcl::Box, hpp::fcl::GJKSolver_indep>
-      (&box1, tf1, &box2, tf2, &gjkSolver, request, result);
-    resultDistance [i].overlap = (resultDistance [i].distance < 0);
-    if (resultDistance [i].distance < 0) {
-      resultDistance [i].distance = 0;
-    }
-    result.clear ();
-  }    
-  gettimeofday (&t1, NULL);
-  double t = (t1.tv_sec - t0.tv_sec) + 1e-6*(t1.tv_usec - t0.tv_usec + 0.);
-  std::cout << "Time for " << nbSamples
-	    << " calls to ShapeShapeDistance (in seconds): "
-	    << t << std::endl;
-
-  // obbDisjointAndLowerBoundDistance
-  gettimeofday (&t0, NULL);
-  for (std::size_t i=0; i<nbSamples; ++i) {
-    resultDisjointAndLowerBound [i].overlap =
-      !obbDisjointAndLowerBoundDistance (sample [i].R, sample [i].T,
-					 sample [i].extent1,
-					 sample [i].extent2,
-					 squaredDistance);
-    resultDisjointAndLowerBound [i].distance = sqrt (squaredDistance);
-  }    
-  gettimeofday (&t1, NULL);
-  t = (t1.tv_sec - t0.tv_sec) + 1e-6*(t1.tv_usec - t0.tv_usec + 0.);
-  std::cout << "Time for " << nbSamples
-	    << " calls to obbDisjointAndLowerBoundDistance"
-	    << " (in seconds): "
-	    << t << std::endl;
-
-  gettimeofday (&t0, NULL);
-  for (std::size_t i=0; i<nbSamples; ++i) {
-    resultDisjoint [i].overlap =
-      !obbDisjoint (sample [i].R, sample [i].T,
-		    sample [i].extent1,
-		    sample [i].extent2);
-    resultDisjoint [i].distance = 0;
-  }    
-  gettimeofday (&t1, NULL);
-  t = (t1.tv_sec - t0.tv_sec) + 1e-6*(t1.tv_usec - t0.tv_usec + 0.);
-  std::cout << "Time for " << nbSamples << " calls to obbDisjoint"
-	    << " (in seconds): "
-	    << t << std::endl;
-
-  // Test correctness of results
-  bool res1, res2, res3;
-  for (std::size_t i=0; i<nbSamples; ++i) {
-    res1 = resultDisjoint [i].overlap;
-    res2 = resultDisjointAndLowerBound [i].overlap;
-    res3 = resultDistance [i].overlap;
-    sumDistanceLowerBound += resultDisjointAndLowerBound [i].distance;
-    sumDistance += resultDistance [i].distance;
-
-#if 0
-    std::cout << "ShapeShapeDistance:    overlap: "
-	      << resultDistance [i].overlap
-	      << ", distance: " << resultDistance [i].distance << std::endl;
-    std::cout << "disjoint:              overlap: "
-	      << resultDisjoint [i].overlap
-	      << ", distance: " << resultDisjoint [i].distance << std::endl;
-    std::cout << "disjointAndLowerBound: overlap: "
-	      << resultDisjointAndLowerBound [i].overlap
-	      << ", distance: " << resultDisjointAndLowerBound [i].distance
-	      << std::endl << std::endl;
-#endif
-    BOOST_CHECK (res1 == res2);
-    BOOST_CHECK (res1 == res3);
-    BOOST_CHECK ((!res1 && resultDisjointAndLowerBound [i].distance > 0) ||
-		 (res1 && resultDisjointAndLowerBound [i].distance == 0));
-    BOOST_CHECK (resultDisjointAndLowerBound [i].distance <=
-		 resultDistance [i].distance);
-  }
-  std::cout << "Sum distances ShapeShapeDistance:               "
-	    << sumDistance << std::endl;
-  std::cout << "Sum distances obbDisjointAndLowerBoundDistance: "
-	    << sumDistanceLowerBound << std::endl;
+  return br.print (os);
+}
+
+BenchmarkResult benchmark_obb_case (
+    const Matrix3f& B, const Vec3f& T,
+    const Vec3f& a, const Vec3f& b,
+    const CollisionRequest& request,
+    std::size_t N)
+{
+  const FCL_REAL breakDistance (request.break_distance + request.security_margin);
+  const FCL_REAL breakDistance2 = breakDistance * breakDistance;
+
+  BenchmarkResult result;
+  // First determine which axis provide the answer
+  result.ifId = obbDisjoint_impls::separatingAxisId (B, T, a, b, breakDistance2,
+      result.squaredLowerBoundDistance);
+  bool disjoint = obbDisjoint_impls::distance (B, T, a, b, result.distance);
+  assert (0 <= result.ifId && result.ifId <= 11);
+
+  // Sanity check
+  result.failure = true;
+  bool overlap = (result.ifId == 11); 
+  FCL_REAL dist_thr = request.break_distance + request.security_margin;
+  if (       !overlap && result.distance <= 0) {
+    std::cerr << "Failure: negative distance for disjoint OBBs.";
+  } else if (!overlap && result.squaredLowerBoundDistance < 0) {
+    std::cerr << "Failure: negative distance lower bound.";
+  } else if (!overlap && eps < std::sqrt (result.squaredLowerBoundDistance) - result.distance) {
+    std::cerr << "Failure: distance is inferior to its lower bound (diff = " << 
+      std::sqrt (result.squaredLowerBoundDistance) - result.distance << ").";
+  } else if (overlap != !disjoint && result.distance >= dist_thr - eps) {
+    std::cerr << "Failure: overlapping test and distance query mismatch.";
+  } else if (overlap              && result.distance >= dist_thr - eps) {
+    std::cerr << "Failure: positive distance for overlapping OBBs.";
+  } else {
+    result.failure = false;
+  }
+  if (result.failure) {
+    std::cerr
+      << "\nR = " << Quaternion3f(B).coeffs().transpose().format (py_fmt)
+      << "\nT = " << T.transpose().format (py_fmt)
+      << "\na = " << a.transpose().format (py_fmt)
+      << "\nb = " << b.transpose().format (py_fmt)
+      << "\nresult = " << result
+      << '\n' << std::endl;
+  }
+
+  // Compute time
+  FCL_REAL tmp;
+  clock_type::time_point start, end;
+
+  // ------------------------ 0 --------------------------------------
+  start = clock_type::now();
+  for (std::size_t i = 0; i < N; ++i)
+    obbDisjoint_impls::withRuntimeLoop (B, T, a, b, breakDistance2, tmp);
+  end = clock_type::now();
+  result.duration[0] = (end - start) / N;
+
+  // ------------------------ 1 --------------------------------------
+  start = clock_type::now();
+  for (std::size_t i = 0; i < N; ++i)
+    obbDisjoint_impls::withManualLoopUnrolling_1 (B, T, a, b, breakDistance2, tmp);
+  end = clock_type::now();
+  result.duration[1] = (end - start) / N;
+
+  // ------------------------ 2 --------------------------------------
+  start = clock_type::now();
+  for (std::size_t i = 0; i < N; ++i)
+    obbDisjoint_impls::withManualLoopUnrolling_2 (B, T, a, b, breakDistance2, tmp);
+  end = clock_type::now();
+  result.duration[2] = (end - start) / N;
+
+  // ------------------------ 3 --------------------------------------
+  start = clock_type::now();
+  for (std::size_t i = 0; i < N; ++i)
+    obbDisjoint_impls::withTemplateLoopUnrolling_1 (B, T, a, b, breakDistance2, tmp);
+  end = clock_type::now();
+  result.duration[3] = (end - start) / N;
+
+  // ------------------------ 4 --------------------------------------
+  start = clock_type::now();
+  for (std::size_t i = 0; i < N; ++i)
+    obbDisjoint_impls::withPartialTemplateLoopUnrolling_1 (B, T, a, b, breakDistance2, tmp);
+  end = clock_type::now();
+  result.duration[4] = (end - start) / N;
+
+  // ------------------------ 5 --------------------------------------
+  start = clock_type::now();
+  for (std::size_t i = 0; i < N; ++i)
+    obbDisjoint_impls::originalWithLowerBound (B, T, a, b, breakDistance2, tmp);
+  end = clock_type::now();
+  result.duration[5] = (end - start) / N;
+
+  // ------------------------ 6 --------------------------------------
+  start = clock_type::now();
+  // The 2 last arguments are unused.
+  for (std::size_t i = 0; i < N; ++i)
+    obbDisjoint_impls::originalWithNoLowerBound (B, T, a, b, breakDistance2, tmp);
+  end = clock_type::now();
+  result.duration[6] = (end - start) / N;
+
+  return result;
+}
+
+std::size_t obb_overlap_and_lower_bound_distance()
+{
+  std::size_t nbFailure = 0;
+
+  // Create two OBBs axis
+  Vec3f a, b;
+  Matrix3f B;
+  Vec3f T;
+  CollisionRequest request;
+
+  static const size_t nbRandomOBB       = 100;
+  static const size_t nbTransformPerOBB = 1000;
+  static const size_t nbRunForTimeMeas  = 10000;
+  static const FCL_REAL extentNorm = 1.;
+
+  std::ostream& output = std::cout;
+  output << BenchmarkResult::headers << '\n';
+  
+  BenchmarkResult result;
+  for (std::size_t iobb = 0; iobb < nbRandomOBB; ++iobb) {
+    randomOBBs (a, b, extentNorm);
+    for (std::size_t itf = 0; itf < nbTransformPerOBB; ++itf) {
+      randomTransform (B, T, a, b, extentNorm);
+      result = benchmark_obb_case (B, T, a, b, request, nbRunForTimeMeas);
+      output << result << '\n';
+      if (result.failure) nbFailure++;
+    }
+  }
+  return nbFailure;
+}
+
+int main ()
+{
+  bool cpuScalingEnabled = checkCpuScalingEnabled();
+  if (cpuScalingEnabled)
+    std::cerr <<
+      "CPU scaling is enabled."
+      "\n\tThe benchmark real time measurements may be noisy and will incur extra overhead."
+      "\n\tUse the following commands to turn on and off."
+      "\n\t\tsudo cpufreq-set --governor performance"
+      "\n\t\tsudo cpufreq-set --governor powersave"
+      "\n"
+      ;
+
+  std::size_t nbFailure = obb_overlap_and_lower_bound_distance();
+  if (nbFailure > INT_MAX) return INT_MAX;
+  return (int)nbFailure;
 }