C++ algorithm like python's 'groupby'

11,597

Solution 1

This doesn't really answer your question, but for the fun of it, I implemented a group_by iterator. Maybe someone will find it useful:

#include <assert.h>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

using std::cout;
using std::cerr;
using std::multiset;
using std::ostringstream;
using std::pair;
using std::vector;

struct Foo
{
  int x;
  std::string y;
  float z;
};

struct FooX {
  typedef int value_type;
  value_type operator()(const Foo &f) const { return f.x; }
};



template <typename Iterator,typename KeyFunc>
struct GroupBy {
  typedef typename KeyFunc::value_type KeyValue;

  struct Range {
    Range(Iterator begin,Iterator end)
    : iter_pair(begin,end)
    {
    }

    Iterator begin() const { return iter_pair.first; }
    Iterator end() const { return iter_pair.second; }

    private:
      pair<Iterator,Iterator> iter_pair;
  };

  struct Group {
    KeyValue value;
    Range range;

    Group(KeyValue value,Range range)
    : value(value), range(range)
    {
    }
  };


  struct GroupIterator {
    typedef Group value_type;

    GroupIterator(Iterator iter,Iterator end,KeyFunc key_func)
    : range_begin(iter), range_end(iter), end(end), key_func(key_func)
    {
      advance_range_end();
    }

    bool operator==(const GroupIterator &that) const
    {
      return range_begin==that.range_begin;
    }

    bool operator!=(const GroupIterator &that) const
    {
      return !(*this==that);
    }

    GroupIterator operator++()
    {
      range_begin = range_end;
      advance_range_end();
      return *this;
    }

    value_type operator*() const
    {
      return value_type(key_func(*range_begin),Range(range_begin,range_end));
    }


    private:
      void advance_range_end()
      {
        if (range_end!=end) {
          typename KeyFunc::value_type value = key_func(*range_end++);
          while (range_end!=end && key_func(*range_end)==value) {
            ++range_end;
          }
        }
      }

      Iterator range_begin;
      Iterator range_end;
      Iterator end;
      KeyFunc key_func;
  };

  GroupBy(Iterator begin_iter,Iterator end_iter,KeyFunc key_func)
  : begin_iter(begin_iter),
    end_iter(end_iter),
    key_func(key_func)
  {
  }

  GroupIterator begin() { return GroupIterator(begin_iter,end_iter,key_func); }

  GroupIterator end() { return GroupIterator(end_iter,end_iter,key_func); }

  private:
    Iterator begin_iter;
    Iterator end_iter;
    KeyFunc key_func;
};


template <typename Iterator,typename KeyFunc>
inline GroupBy<Iterator,KeyFunc>
  group_by(
    Iterator begin,
    Iterator end,
    const KeyFunc &key_func = KeyFunc()
  )
{
  return GroupBy<Iterator,KeyFunc>(begin,end,key_func);
}


static void test()
{
  vector<Foo> foos;
  foos.push_back({5,"bill",2.1});
  foos.push_back({5,"rick",3.7});
  foos.push_back({3,"tom",2.5});
  foos.push_back({7,"joe",3.4});
  foos.push_back({5,"bob",7.2});

  ostringstream out;

  for (auto group : group_by(foos.begin(),foos.end(),FooX())) {
    out << group.value << ":";
    for (auto elem : group.range) {
      out << " " << elem.y;
    }
    out << "\n";
  }

  assert(out.str()==
    "5: bill rick\n"
    "3: tom\n"
    "7: joe\n"
    "5: bob\n"
  );
}

int main(int argc,char **argv)
{
  test();
  return 0;
}

Solution 2

Eric Niebler's ranges library provides a group_by view.

according to the docs it is a header only library and can be included easily.

It's supposed to go into the standard C++ space, but can be used with a recent C++11 compiler.

minimal working example:

#include <map>
#include <vector>
#include <range/v3/all.hpp>
using namespace std;
using namespace ranges;

int main(int argc, char **argv) {
    vector<int> l { 0,1,2,3,6,5,4,7,8,9 };
    ranges::v3::sort(l);
    auto x = l | view::group_by([](int x, int y) { return x / 5 == y / 5; });
    map<int, vector<int>> res;

    auto i = x.begin();
    auto e = x.end();
    for (;i != e; ++i) {
      auto first = *((*i).begin());
      res[first / 5] = to_vector(*i);
    }

    // res = { 0 : [0,1,2,3,4], 1: [5,6,7,8,9] }
}

(I compiled this with clang 3.9.0. and --std=c++11)

Solution 3

I recently discovered cppitertools.

It fulfills this need exactly as described.

https://github.com/ryanhaining/cppitertools#groupby

Solution 4

How about this?

template <typename StructType, typename FieldSelectorUnaryFn>
auto GroupBy(const std::vector<StructType>& instances, const FieldSelectorUnaryFn& fieldChooser)
{
    StructType _;
    using FieldType = decltype(fieldChooser(_));
    std::map<FieldType, std::vector<StructType>> instancesByField;
    for (auto& instance : instances)
    {
        instancesByField[fieldChooser(instance)].push_back(instance);
    }
    return instancesByField;
}

and use it like this:

auto itemsByX = GroupBy(items, [](const auto& item){ return item.x; });

Solution 5

I wrote a C++ library to address this problem in an elegant way. Given your struct

struct foo
{
    int x;
    std::string y;
    float z;
};

To group by y you simply do:

std::vector<foo> dataframe;
...
auto groups = group_by(dataframe, &foo::y);

You can also group by more than one variable:

auto groups = group_by(dataframe, &foo::y, &foo::x);

And then iterate through the groups normally:

for(auto& [key, group]: groups)
{
    // do something
}

It also has other operations such as: subset, concat, and others.

Share:
11,597
Brian Cain
Author by

Brian Cain

I enjoy programming on linux and otherwise. python's pretty awesome, but the time spent in my career is dominated by programming in C++. Resume: https://www.dropbox.com/s/jz7rg0w7i6vgsyw/resume.pdf

Updated on June 24, 2022

Comments

  • Brian Cain
    Brian Cain almost 2 years

    Are there any C++ transformations which are similar to itertools.groupby()?

    Of course I could easily write my own, but I'd prefer to leverage the idiomatic behavior or compose one from the features provided by the STL or boost.

    #include <cstdlib>
    #include <map>
    #include <algorithm>
    #include <string>
    #include <vector>
    
    struct foo
    {
            int x;
            std::string y;
            float z;
    };
    
    bool lt_by_x(const foo &a, const foo &b)
    {
            return a.x < b.x;
    }
    
    void list_by_x(const std::vector<foo> &foos, std::map<int, std::vector<foo> > &foos_by_x)
    {
            /* ideas..? */
    }
    
    int main(int argc, const char *argv[])
    {
            std::vector<foo> foos;
            std::map<int, std::vector<foo> > foos_by_x;
    
            std::vector<foo> sorted_foos;
            std::sort(foos.begin(), foos.end(), lt_by_x);
            list_by_x(sorted_foos, foos_by_x);
    
            return EXIT_SUCCESS;
    }
    
  • kfmfe04
    kfmfe04 over 11 years
    +1 ...even though I would've expected 5: bob to go with 5: bill rick - this is fixable
  • Ryan Haining
    Ryan Haining over 9 years
    c++14 master: coming 2015
  • Alexander Oh
    Alexander Oh almost 8 years
  • NetMage
    NetMage over 6 years
    Doesn't seem like a great implementation compared to how SQL (or LINQ) works.
  • user1633272
    user1633272 almost 4 years
    view::group_by can only group by true and false.
  • Marc Dirven
    Marc Dirven over 3 years
    The while loop can be changed into a std::find_if