From 472416d88063bfe68bb8c2b227971eea096c63d8 Mon Sep 17 00:00:00 2001 From: Chris Laurel Date: Thu, 14 Jan 2010 00:13:51 +0000 Subject: [PATCH] Updated Eigen to version 2.0.11, which features precision fixes for quaternion slerp and 4x4 matrix inversion. --- thirdparty/Eigen/Eigen/src/Core/Part.h | 2 +- thirdparty/Eigen/Eigen/src/Core/util/Macros.h | 2 +- thirdparty/Eigen/Eigen/src/Core/util/Memory.h | 49 +++- .../Eigen/Eigen/src/Geometry/Hyperplane.h | 4 +- .../Eigen/Eigen/src/Geometry/Quaternion.h | 31 +- thirdparty/Eigen/Eigen/src/LU/Inverse.h | 224 +++++++++------ .../Eigen/Eigen/src/Sparse/TriangularSolver.h | 3 + thirdparty/curveplot/include/curveplot.h | 12 + thirdparty/curveplot/src/curveplot.cpp | 267 +++++++++++++++++- 9 files changed, 484 insertions(+), 110 deletions(-) diff --git a/thirdparty/Eigen/Eigen/src/Core/Part.h b/thirdparty/Eigen/Eigen/src/Core/Part.h index 96229f43b..65f4bc194 100644 --- a/thirdparty/Eigen/Eigen/src/Core/Part.h +++ b/thirdparty/Eigen/Eigen/src/Core/Part.h @@ -50,7 +50,7 @@ struct ei_traits > : ei_traits typedef typename ei_unref::type _MatrixTypeNested; enum { Flags = (_MatrixTypeNested::Flags & (HereditaryBits) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) | Mode, - CoeffReadCost = _MatrixTypeNested::CoeffReadCost + CoeffReadCost = _MatrixTypeNested::CoeffReadCost + ConditionalJumpCost }; }; diff --git a/thirdparty/Eigen/Eigen/src/Core/util/Macros.h b/thirdparty/Eigen/Eigen/src/Core/util/Macros.h index cdafc73f3..6a0d692a6 100644 --- a/thirdparty/Eigen/Eigen/src/Core/util/Macros.h +++ b/thirdparty/Eigen/Eigen/src/Core/util/Macros.h @@ -30,7 +30,7 @@ #define EIGEN_WORLD_VERSION 2 #define EIGEN_MAJOR_VERSION 0 -#define EIGEN_MINOR_VERSION 10 +#define EIGEN_MINOR_VERSION 11 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ diff --git a/thirdparty/Eigen/Eigen/src/Core/util/Memory.h b/thirdparty/Eigen/Eigen/src/Core/util/Memory.h index 0a43e7f7b..d12a2ec26 100644 --- a/thirdparty/Eigen/Eigen/src/Core/util/Memory.h +++ b/thirdparty/Eigen/Eigen/src/Core/util/Memory.h @@ -209,18 +209,47 @@ template inline void ei_conditional_aligned_delete(T *pt ei_conditional_aligned_free(ptr); } -/** \internal \returns the number of elements which have to be skipped such that data are 16 bytes aligned */ -template -inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset) +/** \internal \returns the index of the first element of the array that is well aligned for vectorization. + * + * \param array the address of the start of the array + * \param size the size of the array + * + * \note If no element of the array is well aligned, the size of the array is returned. Typically, + * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the + * packet size for the given scalar type is 1, then everything is considered well-aligned. + * + * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a + * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the + * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for + * example with Scalar=double on certain 32-bit platforms, see bug #79. + * + * There is also the variant ei_first_aligned(const MatrixBase&, Integer) defined in Coeffs.h. 
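As an aside for readers of this hunk: the index computation that the new documentation describes can be written out in a few lines. The sketch below is illustrative only — plain float, a 16-byte SSE packet, and a hypothetical name first_aligned_sketch — not the templated Eigen implementation that follows.

#include <algorithm>
#include <cstddef>

// Illustrative only: assumes sizeof(float) == 4 and a 16-byte packet,
// so PacketSize == 4 floats (the SSE case described in the comment above).
static int first_aligned_sketch(const float* array, int size)
{
    const int PacketSize = 4;
    const int PacketAlignedMask = PacketSize - 1;   // works because PacketSize is a power of 2
    std::size_t addr = (std::size_t)array;
    if (addr & (sizeof(float) - 1))
        return size;   // not even scalar-aligned: no element qualifies
    int misalign = int(addr / sizeof(float)) & PacketAlignedMask;
    // Number of elements to skip so that (array + offset) is 16-byte
    // aligned, capped by the array size.
    return std::min((PacketSize - misalign) & PacketAlignedMask, size);
}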
+ */ +template +inline static Integer ei_alignmentOffset(const Scalar* array, Integer size) { typedef typename ei_packet_traits::type Packet; - const int PacketSize = ei_packet_traits::size; - const int PacketAlignedMask = PacketSize-1; - const bool Vectorized = PacketSize>1; - return Vectorized - ? std::min( (PacketSize - (int((size_t(ptr)/sizeof(Scalar))) & PacketAlignedMask)) - & PacketAlignedMask, maxOffset) - : 0; + enum { PacketSize = ei_packet_traits::size, + PacketAlignedMask = PacketSize-1 + }; + + if(PacketSize==1) + { + // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements + // of the array have the same aligment. + return 0; + } + else if(size_t(array) & (sizeof(Scalar)-1)) + { + // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar. + // Consequently, no element of the array is well aligned. + return size; + } + else + { + return std::min( (PacketSize - (Integer((size_t(array)/sizeof(Scalar))) & PacketAlignedMask)) + & PacketAlignedMask, size); + } } /** \internal diff --git a/thirdparty/Eigen/Eigen/src/Geometry/Hyperplane.h b/thirdparty/Eigen/Eigen/src/Geometry/Hyperplane.h index 22c530d4b..b1b26062f 100644 --- a/thirdparty/Eigen/Eigen/src/Geometry/Hyperplane.h +++ b/thirdparty/Eigen/Eigen/src/Geometry/Hyperplane.h @@ -52,9 +52,9 @@ public: typedef _Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef Matrix VectorType; - typedef Matrix Coefficients; + : int(AmbientDimAtCompileTime)+1,1> Coefficients; typedef Block NormalReturnType; /** Default constructor without initialization */ diff --git a/thirdparty/Eigen/Eigen/src/Geometry/Quaternion.h b/thirdparty/Eigen/Eigen/src/Geometry/Quaternion.h index 3fcbff4e7..51dfc270c 100644 --- a/thirdparty/Eigen/Eigen/src/Geometry/Quaternion.h +++ b/thirdparty/Eigen/Eigen/src/Geometry/Quaternion.h @@ -450,22 +450,31 @@ inline Scalar Quaternion::angularDistance(const Quaternion& other) const template Quaternion Quaternion::slerp(Scalar t, const Quaternion& other) const { - static const Scalar one = Scalar(1) - precision(); + static const Scalar one = Scalar(1) - machine_epsilon(); Scalar d = this->dot(other); Scalar absD = ei_abs(d); + + Scalar scale0; + Scalar scale1; + if (absD>=one) - return *this; + { + scale0 = Scalar(1) - t; + scale1 = t; + } + else + { + // theta is the angle between the 2 quaternions + Scalar theta = std::acos(absD); + Scalar sinTheta = ei_sin(theta); - // theta is the angle between the 2 quaternions - Scalar theta = std::acos(absD); - Scalar sinTheta = ei_sin(theta); + scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta; + scale1 = ei_sin( ( t * theta) ) / sinTheta; + if (d<0) + scale1 = -scale1; + } - Scalar scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta; - Scalar scale1 = ei_sin( ( t * theta) ) / sinTheta; - if (d<0) - scale1 = -scale1; - - return Quaternion(scale0 * m_coeffs + scale1 * other.m_coeffs); + return Quaternion(scale0 * coeffs() + scale1 * other.coeffs()); } // set from a rotation matrix diff --git a/thirdparty/Eigen/Eigen/src/LU/Inverse.h b/thirdparty/Eigen/Eigen/src/LU/Inverse.h index 2f15f0953..98e7fc5d0 100644 --- a/thirdparty/Eigen/Eigen/src/LU/Inverse.h +++ b/thirdparty/Eigen/Eigen/src/LU/Inverse.h @@ -29,8 +29,8 @@ *** Part 1 : optimized implementations for fixed-size 2,3,4 cases *** ********************************************************************/ -template -void ei_compute_inverse_in_size2_case(const MatrixType& matrix, MatrixType* result) +template +void 
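The rewritten slerp above is the precision fix named in the commit message: instead of returning *this when the two quaternions are nearly parallel, it falls back to linear interpolation, avoiding a division by a vanishing sin(theta). Here is a self-contained sketch of the same strategy, with a hypothetical Quat type and an arbitrary 1e-12 threshold standing in for Eigen's machine_epsilon():

#include <cmath>

// Hypothetical minimal quaternion; Eigen stores (x,y,z,w) internally but
// the layout is irrelevant to the point being made.
struct Quat { double w, x, y, z; };

Quat slerp_sketch(const Quat& a, const Quat& b, double t)
{
    double d = a.w*b.w + a.x*b.x + a.y*b.y + a.z*b.z;
    double absD = std::fabs(d);

    double scale0, scale1;
    if (absD >= 1.0 - 1e-12)      // arbitrary stand-in for machine_epsilon()
    {
        scale0 = 1.0 - t;         // nearly parallel: sin(theta) ~ 0 and the
        scale1 = t;               // sin-ratio weights blow up, so lerp instead
    }
    else
    {
        double theta    = std::acos(absD);
        double sinTheta = std::sin(theta);
        scale0 = std::sin((1.0 - t) * theta) / sinTheta;
        scale1 = std::sin(t * theta) / sinTheta;
        if (d < 0.0)
            scale1 = -scale1;     // take the shorter arc, as in the patch
    }
    Quat r = { scale0*a.w + scale1*b.w, scale0*a.x + scale1*b.x,
               scale0*a.y + scale1*b.y, scale0*a.z + scale1*b.z };
    return r;
}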
ei_compute_inverse_in_size2_case(const XprType& matrix, MatrixType* result) { typedef typename MatrixType::Scalar Scalar; const Scalar invdet = Scalar(1) / matrix.determinant(); @@ -75,99 +75,155 @@ void ei_compute_inverse_in_size3_case(const MatrixType& matrix, MatrixType* resu result->coeffRef(2, 2) = matrix.minor(2,2).determinant() * invdet; } -template -bool ei_compute_inverse_in_size4_case_helper(const MatrixType& matrix, MatrixType* result) +template +struct ei_compute_inverse_in_size4_case { - /* Let's split M into four 2x2 blocks: - * (P Q) - * (R S) - * If P is invertible, with inverse denoted by P_inverse, and if - * (S - R*P_inverse*Q) is also invertible, then the inverse of M is - * (P' Q') - * (R' S') - * where - * S' = (S - R*P_inverse*Q)^(-1) - * P' = P1 + (P1*Q) * S' *(R*P_inverse) - * Q' = -(P_inverse*Q) * S' - * R' = -S' * (R*P_inverse) - */ - typedef Block XprBlock22; - typedef typename MatrixBase::PlainMatrixType Block22; - Block22 P_inverse; - if(ei_compute_inverse_in_size2_case_with_check(matrix.template block<2,2>(0,0), &P_inverse)) + static void run(const MatrixType& matrix, MatrixType& result) { - const Block22 Q = matrix.template block<2,2>(0,2); - const Block22 P_inverse_times_Q = P_inverse * Q; - const XprBlock22 R = matrix.template block<2,2>(2,0); - const Block22 R_times_P_inverse = R * P_inverse; - const Block22 R_times_P_inverse_times_Q = R_times_P_inverse * Q; - const XprBlock22 S = matrix.template block<2,2>(2,2); - const Block22 X = S - R_times_P_inverse_times_Q; - Block22 Y; - ei_compute_inverse_in_size2_case(X, &Y); - result->template block<2,2>(2,2) = Y; - result->template block<2,2>(2,0) = - Y * R_times_P_inverse; - const Block22 Z = P_inverse_times_Q * Y; - result->template block<2,2>(0,2) = - Z; - result->template block<2,2>(0,0) = P_inverse + Z * R_times_P_inverse; - return true; + result.coeffRef(0,0) = matrix.minor(0,0).determinant(); + result.coeffRef(1,0) = -matrix.minor(0,1).determinant(); + result.coeffRef(2,0) = matrix.minor(0,2).determinant(); + result.coeffRef(3,0) = -matrix.minor(0,3).determinant(); + result.coeffRef(0,2) = matrix.minor(2,0).determinant(); + result.coeffRef(1,2) = -matrix.minor(2,1).determinant(); + result.coeffRef(2,2) = matrix.minor(2,2).determinant(); + result.coeffRef(3,2) = -matrix.minor(2,3).determinant(); + result.coeffRef(0,1) = -matrix.minor(1,0).determinant(); + result.coeffRef(1,1) = matrix.minor(1,1).determinant(); + result.coeffRef(2,1) = -matrix.minor(1,2).determinant(); + result.coeffRef(3,1) = matrix.minor(1,3).determinant(); + result.coeffRef(0,3) = -matrix.minor(3,0).determinant(); + result.coeffRef(1,3) = matrix.minor(3,1).determinant(); + result.coeffRef(2,3) = -matrix.minor(3,2).determinant(); + result.coeffRef(3,3) = matrix.minor(3,3).determinant(); + result /= (matrix.col(0).cwise()*result.row(0).transpose()).sum(); } - else - { - return false; - } -} +}; +#ifdef EIGEN_VECTORIZE_SSE +// The SSE code for the 4x4 float matrix inverse in this file comes from the file +// ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf +// its copyright information is: +// Copyright (C) 1999 Intel Corporation +// See page ii of that document for legal stuff. Not being lawyers, we just assume +// here that if Intel makes this document publically available, with source code +// and detailed explanations, it's because they want their CPUs to be fed with +// good code, and therefore they presumably don't mind us using it in Eigen. 
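The new size-4 routine above fills in the adjugate entry by entry and then divides by the determinant, which its final line obtains by dotting column 0 of the matrix with row 0 of the adjugate — a Laplace expansion down the first column. A plain-array sketch of that identity, with hypothetical helpers det3/det4:

// det(A) = sum_i A(i,0) * C(i,0), where C(i,0) = (-1)^i * minor(i,0) --
// the same quantity that
//   result /= (matrix.col(0).cwise()*result.row(0).transpose()).sum();
// computes, since result holds the transposed cofactor matrix.
static double det3(const double m[4][4], int r0, int r1, int r2,
                                         int c0, int c1, int c2)
{
    return m[r0][c0]*(m[r1][c1]*m[r2][c2] - m[r1][c2]*m[r2][c1])
         - m[r0][c1]*(m[r1][c0]*m[r2][c2] - m[r1][c2]*m[r2][c0])
         + m[r0][c2]*(m[r1][c0]*m[r2][c1] - m[r1][c1]*m[r2][c0]);
}

static double det4(const double m[4][4])
{
    // minor(i,0): delete row i and column 0; the sign alternates with i.
    const int rows[4][3] = { {1,2,3}, {0,2,3}, {0,1,3}, {0,1,2} };
    double d = 0.0;
    for (int i = 0; i < 4; ++i)
    {
        double c = det3(m, rows[i][0], rows[i][1], rows[i][2], 1, 2, 3);
        d += ((i & 1) ? -1.0 : 1.0) * m[i][0] * c;
    }
    return d;
}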
template -void ei_compute_inverse_in_size4_case(const MatrixType& _matrix, MatrixType* result) +struct ei_compute_inverse_in_size4_case { - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; + static void run(const MatrixType& matrix, MatrixType& result) + { + // Variables (Streaming SIMD Extensions registers) which will contain cofactors and, later, the + // lines of the inverted matrix. + __m128 minor0, minor1, minor2, minor3; - // we will do row permutations on the matrix. This copy should have negligible cost. - // if not, consider working in-place on the matrix (const-cast it, but then undo the permutations - // to nevertheless honor constness) - typename MatrixType::PlainMatrixType matrix(_matrix); + // Variables which will contain the lines of the reference matrix and, later (after the transposition), + // the columns of the original matrix. + __m128 row0, row1, row2, row3; - // let's extract from the 2 first colums a 2x2 block whose determinant is as big as possible. - int good_row0, good_row1, good_i; - Matrix absdet; + // Temporary variables and the variable that will contain the matrix determinant. + __m128 det, tmp1; - // any 2x2 block with determinant above this threshold will be considered good enough. - // The magic value 1e-1 here comes from experimentation. The bigger it is, the higher the precision, - // the slower the computation. This value 1e-1 gives precision almost as good as the brutal cofactors - // algorithm, both in average and in worst-case precision. - RealScalar d = (matrix.col(0).squaredNorm()+matrix.col(1).squaredNorm()) * RealScalar(1e-1); - #define ei_inv_size4_helper_macro(i,row0,row1) \ - absdet[i] = ei_abs(matrix.coeff(row0,0)*matrix.coeff(row1,1) \ - - matrix.coeff(row0,1)*matrix.coeff(row1,0)); \ - if(absdet[i] > d) { good_row0=row0; good_row1=row1; goto good; } - ei_inv_size4_helper_macro(0,0,1) - ei_inv_size4_helper_macro(1,0,2) - ei_inv_size4_helper_macro(2,0,3) - ei_inv_size4_helper_macro(3,1,2) - ei_inv_size4_helper_macro(4,1,3) - ei_inv_size4_helper_macro(5,2,3) + // Matrix transposition + const float *src = matrix.data(); + tmp1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)src)), (__m64*)(src+ 4)); + row1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+8))), (__m64*)(src+12)); + row0 = _mm_shuffle_ps(tmp1, row1, 0x88); + row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); + tmp1 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+ 2))), (__m64*)(src+ 6)); + row3 = _mm_loadh_pi(_mm_castpd_ps(_mm_load_sd((double*)(src+10))), (__m64*)(src+14)); + row2 = _mm_shuffle_ps(tmp1, row3, 0x88); + row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); - // no 2x2 block has determinant bigger than the threshold. So just take the one that - // has the biggest determinant - absdet.maxCoeff(&good_i); - good_row0 = good_i <= 2 ? 0 : good_i <= 4 ? 1 : 2; - good_row1 = good_i <= 2 ? good_i+1 : good_i <= 4 ? good_i-1 : 3; - // now good_row0 and good_row1 are correctly set - good: + // Cofactors calculation. Because in the process of cofactor computation some pairs in three- + // element products are repeated, it is not reasonable to load these pairs anew every time. The + // values in the registers with these pairs are formed using shuffle instruction. Cofactors are + // calculated row by row (4 elements are placed in 1 SP FP SIMD floating point register). 
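For readers unfamiliar with the transposition idiom above: the loadh_pi/shuffle sequence is a fused form of the standard SSE 4x4 transpose. The sketch below shows the plainer unpack/move variant (equivalent to the _MM_TRANSPOSE4_PS macro); it produces the same columns, just without folding the transpose into the loads.

#include <xmmintrin.h>

static void transpose4x4_sketch(const float* src, __m128 out[4])
{
    __m128 r0 = _mm_loadu_ps(src);       // m00 m01 m02 m03
    __m128 r1 = _mm_loadu_ps(src + 4);   // m10 m11 m12 m13
    __m128 r2 = _mm_loadu_ps(src + 8);   // m20 m21 m22 m23
    __m128 r3 = _mm_loadu_ps(src + 12);  // m30 m31 m32 m33

    __m128 t0 = _mm_unpacklo_ps(r0, r1); // m00 m10 m01 m11
    __m128 t1 = _mm_unpacklo_ps(r2, r3); // m20 m30 m21 m31
    __m128 t2 = _mm_unpackhi_ps(r0, r1); // m02 m12 m03 m13
    __m128 t3 = _mm_unpackhi_ps(r2, r3); // m22 m32 m23 m33

    out[0] = _mm_movelh_ps(t0, t1);      // m00 m10 m20 m30
    out[1] = _mm_movehl_ps(t1, t0);      // m01 m11 m21 m31
    out[2] = _mm_movelh_ps(t2, t3);      // m02 m12 m22 m32
    out[3] = _mm_movehl_ps(t3, t2);      // m03 m13 m23 m33
}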
- // do row permutations to move this 2x2 block to the top - matrix.row(0).swap(matrix.row(good_row0)); - matrix.row(1).swap(matrix.row(good_row1)); - // now applying our helper function is numerically stable - ei_compute_inverse_in_size4_case_helper(matrix, result); - // Since we did row permutations on the original matrix, we need to do column permutations - // in the reverse order on the inverse - result->col(1).swap(result->col(good_row1)); - result->col(0).swap(result->col(good_row0)); -} + tmp1 = _mm_mul_ps(row2, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_mul_ps(row1, tmp1); + minor1 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); + minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); + minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row1, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); + minor3 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); + minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + row2 = _mm_shuffle_ps(row2, row2, 0x4E); + minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); + minor2 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); + minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); + minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); + + // Evaluation of determinant and its reciprocal value. In the original Intel document, + // 1/det was evaluated using a fast rcpps command with subsequent approximation using + // the Newton-Raphson algorithm. Here, we go for a IEEE-compliant division instead, + // so as to not compromise precision at all. 
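The trade-off described in the comment above, written out side by side: Intel's original rcpps estimate sharpened by one Newton-Raphson step, versus the IEEE-compliant division the patch adopts. Helper names are hypothetical; both operate on the lowest scalar lane only, like the patched code.

#include <xmmintrin.h>

static __m128 reciprocal_nr_sketch(__m128 d)
{
    __m128 x0 = _mm_rcp_ss(d);                 // ~12-bit estimate
    // One Newton-Raphson step roughly doubles the accurate bits:
    // x1 = x0*(2 - d*x0) = x0 + x0 - d*x0*x0
    return _mm_sub_ss(_mm_add_ss(x0, x0),
                      _mm_mul_ss(d, _mm_mul_ss(x0, x0)));
}

static __m128 reciprocal_ieee_sketch(__m128 d)
{
    return _mm_div_ss(_mm_set_ss(1.0f), d);    // correctly rounded
}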
+ det = _mm_mul_ps(row0, minor0); + det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); + det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); +// tmp1= _mm_rcp_ss(det); +// det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); + det = _mm_div_ss(_mm_set_ss(1.0f), det); // <--- yay, one original line not copied from Intel + det = _mm_shuffle_ps(det, det, 0x00); + // warning, Intel's variable naming is very confusing: now 'det' is 1/det ! + + // Multiplication of cofactors by 1/det. Storing the inverse matrix to the address in pointer src. + minor0 = _mm_mul_ps(det, minor0); + float *dst = result.data(); + _mm_storel_pi((__m64*)(dst), minor0); + _mm_storeh_pi((__m64*)(dst+2), minor0); + minor1 = _mm_mul_ps(det, minor1); + _mm_storel_pi((__m64*)(dst+4), minor1); + _mm_storeh_pi((__m64*)(dst+6), minor1); + minor2 = _mm_mul_ps(det, minor2); + _mm_storel_pi((__m64*)(dst+ 8), minor2); + _mm_storeh_pi((__m64*)(dst+10), minor2); + minor3 = _mm_mul_ps(det, minor3); + _mm_storel_pi((__m64*)(dst+12), minor3); + _mm_storeh_pi((__m64*)(dst+14), minor3); + } +}; +#endif /*********************************************** *** Part 2 : selector and MatrixBase methods *** @@ -216,7 +272,7 @@ struct ei_compute_inverse { static inline void run(const MatrixType& matrix, MatrixType* result) { - ei_compute_inverse_in_size4_case(matrix, result); + ei_compute_inverse_in_size4_case::run(matrix, *result); } }; diff --git a/thirdparty/Eigen/Eigen/src/Sparse/TriangularSolver.h b/thirdparty/Eigen/Eigen/src/Sparse/TriangularSolver.h index 8948ae45e..390d0174b 100644 --- a/thirdparty/Eigen/Eigen/src/Sparse/TriangularSolver.h +++ b/thirdparty/Eigen/Eigen/src/Sparse/TriangularSolver.h @@ -43,8 +43,11 @@ struct ei_solve_triangular_selector { lastVal = it.value(); lastIndex = it.index(); + if(lastIndex == i) + break; tmp -= lastVal * other.coeff(lastIndex,col); } + if (Lhs::Flags & UnitDiagBit) other.coeffRef(i,col) = tmp; else diff --git a/thirdparty/curveplot/include/curveplot.h b/thirdparty/curveplot/include/curveplot.h index a0f5c8c79..37300a3f3 100644 --- a/thirdparty/curveplot/include/curveplot.h +++ b/thirdparty/curveplot/include/curveplot.h @@ -79,6 +79,16 @@ class CurvePlot double subdivisionThreshold, double startTime, double endTime) const; + void renderFaded(const Eigen::Transform3d& modelview, + double nearZ, + double farZ, + const Eigen::Vector3d viewFrustumPlaneNormals[], + double subdivisionThreshold, + double startTime, + double endTime, + const Eigen::Vector4f& color, + double fadeStartTime, + double fadeEndTime) const; unsigned int lastUsed() const { return m_lastUsed; } void setLastUsed(unsigned int lastUsed) { m_lastUsed = lastUsed; } @@ -89,6 +99,8 @@ class CurvePlot bool empty() const { return m_samples.empty(); } + unsigned int sampleCount() const { return m_samples.size(); } + private: std::deque m_samples; diff --git a/thirdparty/curveplot/src/curveplot.cpp b/thirdparty/curveplot/src/curveplot.cpp index 0a0bb12b0..23fa9e3ec 100644 --- a/thirdparty/curveplot/src/curveplot.cpp +++ b/thirdparty/curveplot/src/curveplot.cpp @@ -235,6 +235,25 @@ public: #endif } + inline void vertex(const Vector4d& v, const Vector4f& color) + { +#if USE_VERTEX_BUFFER + data[currentPosition++] = v.cast(); + ++currentStripLength; + if (currentPosition == capacity) + { + flush(); + + data[0] = v.cast(); + currentPosition = 1; + currentStripLength = 1; + } +#else + glColor4fv(color.data()); + glVertex3dv(v.data()); +#endif + } + inline void begin() { #if !USE_VERTEX_BUFFER @@ -423,6 +442,86 @@ 
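Stepping back to the TriangularSolver hunk above: the added break stops the accumulation as soon as the iterator reaches the diagonal entry of row i, which must not be folded into the right-hand-side sum. A CSR-style sketch of that forward substitution, solving L x = b in place; the types and names (SparseRow, lower_solve_sketch) are illustrative, not Eigen's:

#include <vector>

struct SparseRow { std::vector<int> index; std::vector<double> value; };

static void lower_solve_sketch(const std::vector<SparseRow>& L,
                               std::vector<double>& b)
{
    for (std::size_t i = 0; i < L.size(); ++i)
    {
        double tmp  = b[i];
        double diag = 1.0;                  // unit diagonal unless stored
        for (std::size_t k = 0; k < L[i].index.size(); ++k)
        {
            if (L[i].index[k] == int(i))    // reached the diagonal: stop
            {
                diag = L[i].value[k];
                break;
            }
            tmp -= L[i].value[k] * b[L[i].index[k]];
        }
        b[i] = tmp / diag;
    }
}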
public: return restartCurve; } + // Return the GL restart status: true if the last segment of the + // curve was culled and we need to start a new primitive sequence + // with glBegin(). + bool renderCubicFaded(bool restartCurve, + const Matrix4d& coeff, + double t0, double t1, + const Vector4f& color, + double fadeStart, double fadeRate, + double curveBoundingRadius, + int depth) const + { + const double dt = (t1 - t0) * InvSubdivisionFactor; + double segmentBoundingRadius = curveBoundingRadius * InvSubdivisionFactor; + +#if DEBUG_ADAPTIVE_SPLINE + { + int c = depth % 10; + glColor4f(SplineColors[c][0], SplineColors[c][1], SplineColors[c][2], 1.0f); + ++SegmentCounts[depth]; + } +#endif + + Vector4d lastP = coeff * Vector4d(1.0, t0, t0 * t0, t0 * t0 * t0); + double lastOpacity = (t0 - fadeStart) * fadeRate; + lastOpacity = max(0.0, min(1.0, lastOpacity)); // clamp + + for (unsigned int i = 1; i <= SubdivisionFactor; i++) + { + double t = t0 + dt * i; + Vector4d p = coeff * Vector4d(1.0, t, t * t, t * t * t); + double opacity = (t - fadeStart) * fadeRate; + opacity = max(0.0, min(1.0, opacity)); // clamp + + double minDistance = max(-m_viewFrustum.nearZ(), abs(p.z()) - segmentBoundingRadius); + + if (segmentBoundingRadius >= m_subdivisionThreshold * minDistance) + { + if (m_viewFrustum.cullSphere(p, segmentBoundingRadius)) + { + if (!restartCurve) + { + m_vbuf.end(); + restartCurve = true; + } + } + else + { + restartCurve = renderCubicFaded(restartCurve, + coeff, t - dt, t, + color, + fadeStart, fadeRate, + segmentBoundingRadius, + depth + 1); + } + } + else + { +#if DEBUG_ADAPTIVE_SPLINE + { + int c = depth % 10; + glColor4f(SplineColors[c][0], SplineColors[c][1], SplineColors[c][2], i % 2 ? 0.25f : 1.0f); + } +#endif + + if (restartCurve) + { + m_vbuf.begin(); + m_vbuf.vertex(lastP, Vector4f(color.x(), color.y(), color.z(), color.w() * float(lastOpacity))); + restartCurve = false; + } + + m_vbuf.vertex(p, Vector4f(color.x(), color.y(), color.z(), color.w() * float(opacity))); + } + lastP = p; + lastOpacity = opacity; + } + + return restartCurve; + } + private: HighPrec_VertexBuffer& m_vbuf; HighPrec_Frustum& m_viewFrustum; @@ -554,7 +653,6 @@ CurvePlot::render(const Transform3d& modelview, vbuf.createVertexBuffer(); vbuf.setup(); - clog << "size: " << m_samples.size() << endl; for (unsigned int i = 1; i < m_samples.size(); i++) { // Transform the points into camera space. @@ -805,3 +903,170 @@ CurvePlot::render(const Transform3d& modelview, vbuf.flush(); vbuf.finish(); } + + +/** Draw a curve with a fade effect. The curve is at full opacity at fadeStartTime + * and completely transparent at fadeEndTime. fadeStartTime may be greater than + * fadeEndTime--this just means that the fade direction will be reversed. 
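Concretely, the opacity mapping used by renderCubicFaded above and by the loop below is a clamped linear ramp between the two fade times; because fadeRate = 1/(fadeEndTime - fadeStartTime) changes sign when the times are swapped, the reversal falls out for free. A one-function sketch with a hypothetical name:

#include <algorithm>

static double fade_opacity_sketch(double t, double fadeStart, double fadeEnd)
{
    double rate    = 1.0 / (fadeEnd - fadeStart);  // may be negative
    double opacity = (t - fadeStart) * rate;
    return std::max(0.0, std::min(1.0, opacity));  // clamp, as in the code
}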
+ */ +void +CurvePlot::renderFaded(const Eigen::Transform3d& modelview, + double nearZ, + double farZ, + const Eigen::Vector3d viewFrustumPlaneNormals[], + double subdivisionThreshold, + double startTime, + double endTime, + const Vector4f& color, + double fadeStartTime, + double fadeEndTime) const +{ + // Flag to indicate whether we need to issue a glBegin() + bool restartCurve = true; + + if (m_samples.empty() || endTime <= m_samples.front().t || startTime >= m_samples.back().t) + return; + + // Linear search for the first sample + unsigned int startSample = 0; + while (startSample < m_samples.size() - 1 && startTime < m_samples[startSample].t) + startSample++; + + // Start at the first sample with time <= startTime + if (startSample > 0) + startSample--; + + double fadeDuration = fadeEndTime - fadeStartTime; + double fadeRate = 1.0 / fadeDuration; + + const Vector3d& p0_ = m_samples[startSample].position; + const Vector3d& v0_ = m_samples[startSample].velocity; + Vector4d p0 = modelview * Vector4d(p0_.x(), p0_.y(), p0_.z(), 1.0); + Vector4d v0 = modelview * Vector4d(v0_.x(), v0_.y(), v0_.z(), 0.0); + double opacity0 = (m_samples[startSample].t - fadeStartTime) * fadeRate; + opacity0 = max(0.0, min(1.0, opacity0)); + + HighPrec_Frustum viewFrustum(nearZ, farZ, viewFrustumPlaneNormals); + HighPrec_RenderContext rc(vbuf, viewFrustum, subdivisionThreshold); + + vbuf.createVertexBuffer(); + vbuf.setup(); + + bool firstSegment = true; + bool lastSegment = false; + + for (unsigned int i = startSample + 1; i < m_samples.size() && !lastSegment; i++) + { + // Transform the points into camera space. + const Vector3d& p1_ = m_samples[i].position; + const Vector3d& v1_ = m_samples[i].velocity; + Vector4d p1 = modelview * Vector4d(p1_.x(), p1_.y(), p1_.z(), 1.0); + Vector4d v1 = modelview * Vector4d(v1_.x(), v1_.y(), v1_.z(), 0.0); + double opacity1 = (m_samples[i].t - fadeStartTime) * fadeRate; + opacity1 = max(0.0, min(1.0, opacity1)); + + if (endTime <= m_samples[i].t) + { + lastSegment = true; + } + + // O(t) is an approximating function for this segment of + // the orbit, with 0 <= t <= 1 + // C is the viewer position + // d(t) = |O(t) - C|, the distance from viewer to the + // orbit segment. + + double curveBoundingRadius = m_samples[i].boundingRadius; + + // Estimate the minimum possible distance from the + // curve to the z=0 plane. If the curve is far enough + // away to be approximated as a straight line, we'll just + // render it. Otherwise, it should be a performance win + // to do a sphere-frustum cull test before subdividing and + // rendering segment. + double minDistance = abs(p0.z()) - curveBoundingRadius; + + // Render close segments as splines with adaptive subdivision. The + // subdivisions eliminates kinks between line segments and also + // prevents clipping precision problems that occur when a + // very long line is rendered with a relatively small view + // volume. + if (curveBoundingRadius >= subdivisionThreshold * minDistance || lastSegment || firstSegment) + { + // Skip rendering this section if it lies outside the view + // frustum. 
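// The cull test invoked just below is not part of this diff; the sketch
// here shows the conventional sphere-vs-frustum test it presumably
// performs, assuming a camera at the origin looking down -z (so nearZ
// and farZ are negative, matching the far-plane check later in this
// function) and inward-pointing side-plane normals. All names are
// illustrative.
//
// static bool cull_sphere_sketch(const double c[3], double r,
//                                const double n[4][3],
//                                double nearZ, double farZ)
// {
//     if (c[2] - r > nearZ) return true;   // entirely before the near plane
//     if (c[2] + r < farZ)  return true;   // entirely beyond the far plane
//     for (int i = 0; i < 4; ++i)          // four side planes through the eye
//         if (n[i][0]*c[0] + n[i][1]*c[1] + n[i][2]*c[2] < -r)
//             return true;                 // entirely outside this plane
//     return false;
// }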
+ if (viewFrustum.cullSphere(p0, curveBoundingRadius)) + { + if (!restartCurve) + { + vbuf.end(); + restartCurve = true; + } + } + else + { + double dt = m_samples[i].t - m_samples[i - 1].t; + double t0 = 0.0; + double t1 = 1.0; + + if (firstSegment) + { + t0 = (startTime - m_samples[i - 1].t) / dt; + t0 = std::max(0.0, std::min(1.0, t0)); + firstSegment = false; + } + + if (lastSegment) + { + t1 = (endTime - m_samples[i - 1].t) / dt; + } + + Matrix4d coeff = cubicHermiteCoefficients(p0, p1, v0 * dt, v1 * dt); + restartCurve = rc.renderCubicFaded(restartCurve, coeff, + t0, t1, + color, + (fadeStartTime - m_samples[i - 1].t) / dt, fadeRate * dt, + curveBoundingRadius, 1); + } + } + else + { + // Apparent size of curve is small enough that we can approximate + // it as a line. + + // Simple cull test--just check the far plane. This is required because + // apparent clipping precision limitations can cause a GPU to draw lines + // that lie completely beyond the far plane. + if (p0.z() + curveBoundingRadius < farZ) + { + if (!restartCurve) + { + vbuf.end(); + restartCurve = true; + } + } + else + { + if (restartCurve) + { + vbuf.begin(); + vbuf.vertex(p0, Vector4f(color.x(), color.y(), color.z(), color.w() * float(opacity0))); + restartCurve = false; + } + vbuf.vertex(p1, Vector4f(color.x(), color.y(), color.z(), color.w() * float(opacity1))); + } + } + + p0 = p1; + v0 = v1; + opacity0 = opacity1; + } + + if (!restartCurve) + { + vbuf.end(); + } + + vbuf.flush(); + vbuf.finish(); +}
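Finally, a hypothetical call site for the new entry point, to show how it composes with the parameters render() already takes. Everything here except renderFaded's signature is illustrative: the function name, the 2e-3 subdivision threshold, and the choice of fade interval.

#include <Eigen/Geometry>
#include "curveplot.h"

void drawFadedOrbit(const CurvePlot& plot,
                    const Eigen::Transform3d& modelview,
                    double nearZ, double farZ,
                    const Eigen::Vector3d frustumNormals[],
                    double t0, double t1)
{
    Eigen::Vector4f color(1.0f, 1.0f, 1.0f, 1.0f);
    // Fade across the second half of the drawn interval; swapping the two
    // fade times reverses the ramp, as documented above.
    plot.renderFaded(modelview, nearZ, farZ, frustumNormals,
                     2.0e-3,            // subdivisionThreshold (arbitrary)
                     t0, t1,            // interval of the curve to draw
                     color,
                     0.5 * (t0 + t1),   // fadeStartTime
                     t1);               // fadeEndTime
}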