Most efficient way to loop through an Eigen matrix

27,254

Solution 1

Eigen allocates matrices in column-major (Fortran) order by default (documentation).

The fastest way to iterate over a matrix is in storage order, doing it the wrong way round will increase the number of cache misses (which if your matrix doesn't fit in L1 will dominate your computation time, so read increase your computation time) by a factor of cacheline/elemsize (probably 64/8=8).

If your matrix fits into L1 cache this won't make a difference, but a good compiler should be able to vectorise the loop, which with AVX enabled (on a shiny new core i7) could give you a speedup of as much as 4 times. (256 bits / 64 bits).

Finally don't expect any of Eigen's built-in functions to give you a speed-up (I don't think there are iterators anyhow, but I may be mistaken), they're just going to give you the same (very simple) code.

TLDR: Swap your order of iteration, you need to vary the row index most quickly.

Solution 2

I did some benchmarks to checkout which way is quicker, I got the following results (in seconds):

12
30
3
6
23
3

The first line is doing iteration as suggested by @jleahy. The second line is doing iteration as I've done in my code in the question (the inverse order of @jleahy). The third line is doing iteration using PlainObjectBase::data()like this for (int i = 0; i < matrixObject.size(); i++). The other 3 lines repeat the same as the above, but with an temporary as suggest by @lucas92

I also did the same tests but using substituting /if else.*/ for /else/ (no special treatment for sparse matrix) and I got the following (in seconds):

10
27
3
6
24
2

Doing the tests again gave me results pretty similar. I used g++ 4.7.3 with -O3. The code:

#include <ctime>
#include <iostream>
#include <Eigen/Dense>

using namespace std;

 template <typename T, int R, int C>
    inline T sum_kahan1(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
      for (size_t j = 0; j < nRows; ++j)
      {
          if (xs(j,i)>0)
          {
            yP = xs(j,i) - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else if (xs(j,i)<0)
          {
            yN = xs(j,i) - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }

 template <typename T, int R, int C>
    inline T sum_kahan2(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
      for (size_t j = 0; j < nCols; ++j)
      {
          if (xs(i,j)>0)
          {
            yP = xs(i,j) - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else if (xs(i,j)<0)
          {
            yN = xs(i,j) - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }


 template <typename T, int R, int C>
    inline T sum_kahan3(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
    for (size_t i = 0, size = xs.size(); i < size; i++)
      {
          if ((*(xs.data() + i))>0)
          {
            yP = (*(xs.data() + i)) - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else if ((*(xs.data() + i))<0)
          {
            yN = (*(xs.data() + i)) - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }

 template <typename T, int R, int C>
    inline T sum_kahan1t(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
      for (size_t j = 0; j < nRows; ++j)
      {
      T temporary = xs(j,i);
          if (temporary>0)
          {
            yP = temporary - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else if (temporary<0)
          {
            yN = temporary - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }

 template <typename T, int R, int C>
    inline T sum_kahan2t(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
      for (size_t j = 0; j < nCols; ++j)
      {
      T temporary = xs(i,j);
          if (temporary>0)
          {
            yP = temporary - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else if (temporary<0)
          {
            yN = temporary - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }


 template <typename T, int R, int C>
    inline T sum_kahan3t(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
    for (size_t i = 0, size = xs.size(); i < size; i++)
      {
      T temporary = (*(xs.data() + i));
          if (temporary>0)
          {
            yP = temporary - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else if (temporary<0)
          {
            yN = temporary - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }

 template <typename T, int R, int C>
    inline T sum_kahan1e(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
      for (size_t j = 0; j < nRows; ++j)
      {
          if (xs(j,i)>0)
          {
            yP = xs(j,i) - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else
          {
            yN = xs(j,i) - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }

 template <typename T, int R, int C>
    inline T sum_kahan2e(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
      for (size_t j = 0; j < nCols; ++j)
      {
          if (xs(i,j)>0)
          {
            yP = xs(i,j) - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else
          {
            yN = xs(i,j) - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }


 template <typename T, int R, int C>
    inline T sum_kahan3e(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
    for (size_t i = 0, size = xs.size(); i < size; i++)
      {
          if ((*(xs.data() + i))>0)
          {
            yP = (*(xs.data() + i)) - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else
          {
            yN = (*(xs.data() + i)) - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }

 template <typename T, int R, int C>
    inline T sum_kahan1te(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
      for (size_t j = 0; j < nRows; ++j)
      {
      T temporary = xs(j,i);
          if (temporary>0)
          {
            yP = temporary - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else
          {
            yN = temporary - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }

 template <typename T, int R, int C>
    inline T sum_kahan2te(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
      for (size_t j = 0; j < nCols; ++j)
      {
      T temporary = xs(i,j);
          if (temporary>0)
          {
            yP = temporary - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else
          {
            yN = temporary - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }


 template <typename T, int R, int C>
    inline T sum_kahan3te(const Eigen::Matrix<T,R,C>& xs) {
      if (xs.size() == 0) return 0;
      T sumP(0);
      T sumN(0);
      T tP(0);
      T tN(0);
      T cP(0);
      T cN(0);
      T yP(0);
      T yN(0);
    for (size_t i = 0, size = xs.size(); i < size; i++)
      {
      T temporary = (*(xs.data() + i));
          if (temporary>0)
          {
            yP = temporary - cP;
          tP = sumP + yP;
          cP = (tP - sumP) - yP;
          sumP = tP;
          }
        else
          {
            yN = temporary - cN;
          tN = sumN + yN;
          cN = (tN - sumN) - yN;
          sumN = tN;
          }
      }
      return sumP+sumN;
    }


int main() {

    Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> test = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>::Random(10000,10000);

    cout << "start" << endl;   
    int now;

    now = time(0);
    sum_kahan1(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan2(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan3(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan1t(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan2t(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan3t(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan1e(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan2e(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan3e(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan1te(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan2te(test);
    cout << time(0) - now << endl;

    now = time(0);
    sum_kahan3te(test);
    cout << time(0) - now << endl;

    return 0;
}

Solution 3

I notice that the code is equivalent to the sum of all the entries in the matrix, i.e., you could just do this:

return xs.sum();

I would assume that would perform better since it's only a single pass, and furthermore Eigen ought to "know" how to arrange the passes for optimum performance.

If, however, you want to retain the two passes, you could express this using the coefficient-wise reduction mechanisms, like this:

return (xs.array() > 0).select(xs, 0).sum() +
       (xs.array() < 0).select(xs, 0).sum();

which uses the boolean coefficient-wise select to pick the positive and negative entries. I don't know whether it would outperform the hand-rolled loops, but in theory coding it this way allows Eigen (and the compiler) to know more about your intention, and possibly improve the results.

Solution 4

Try to store xs(i,j) inside a temporary variable inside the loop so you only call the function once.

Share:
27,254
random_user
Author by

random_user

random_user is random person that came from a random place at random time.

Updated on July 05, 2022

Comments

  • random_user
    random_user almost 2 years

    I'm creating some functions to do things like the "separated sum" of negative and positive number, kahan, pairwise and other stuff where it doesn't matter the order I take the elements from the matrix, for example:

    template <typename T, int R, int C>
    inline T sum(const Eigen::Matrix<T,R,C>& xs)
    {
      T sumP(0);
      T sumN(0);
      for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
       for (size_t j = 0; j < nCols; ++j)
       {
            if (xs(i,j)>0)
              sumP += xs(i,j);
            else if (xs(i,j)<0) //ignore 0 elements: improvement for sparse matrices I think
              sumN += xs(i,j);
       }
     return sumP+sumN;
    }
    

    Now, I would like to make this as efficient as possible, so my question is, would it be better to loop through each column of each row like the above, or do the opposite like the the following:

    for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
      for (size_t j = 0; j < nRows; ++j)
    

    (I suppose this depends on the order that the matrix elements are allocated in memory, but I couldn't find this in Eigen's manual).

    Also, are there other alternate ways like using iterators (do they exist in Eigen?) that might be slightly faster?