A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dataframe.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_CONTAINERS_DATAFRAME_H_
2 #define GENESIS_UTILS_CONTAINERS_DATAFRAME_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
35 
36 #include <cassert>
37 #include <cstddef>
38 #include <iterator>
39 #include <stdexcept>
40 #include <string>
41 #include <type_traits>
42 #include <unordered_map>
43 #include <utility>
44 #include <vector>
45 
46 namespace genesis {
47 namespace utils {
48 
49 // =================================================================================================
50 // Data Frame
51 // =================================================================================================
52 
56 template <typename T>
57 class Dataframe
58 {
59 public:
60 
61  // ---------------------------------------------------------------------------------------------
62  // Column Class
63  // ---------------------------------------------------------------------------------------------
64 
68  class Column
69  {
70  public:
71 
72  // -------------------------------------------------------------------------
73  // Member Types
74  // -------------------------------------------------------------------------
75 
76  using self_type = Column;
77  using value_type = T;
78 
80  using const_reference = const value_type&;
81  using pointer = value_type*;
82  using const_pointer = const value_type*;
83 
84  using iterator = typename std::vector< value_type >::iterator;
85  using const_iterator = typename std::vector< value_type >::const_iterator;
86 
87  using size_type = size_t;
88 
89  // -------------------------------------------------------------------------
90  // Constructor and Rule of Five
91  // -------------------------------------------------------------------------
92 
93  friend class Dataframe;
94 
96  : df_( &df )
97  , index_( index )
98  {}
99 
100  ~Column() = default;
101 
102  Column( Column const& ) = default;
103  Column( Column&& ) = default;
104 
105  Column& operator= ( Column const& other )
106  {
107  // Either we assign when constructing (thanks, STL), in which case the assigned-to
108  // column has size 0, or we do so when moving the vector contents, in which case
109  // the size needs to stay the same.
110  if( content_.size() > 0 && content_.size() != other.size() ) {
111  throw std::runtime_error(
112  "Cannot assign Dataframe column with different size."
113  );
114  }
115 
116  content_ = other.content_;
117  return *this;
118  }
119 
121  {
122  // Either we assign when constructing (thanks, STL), in which case the assigned-to
123  // column has size 0, or we do so when moving the vector contents, in which case
124  // the size needs to stay the same.
125  if( content_.size() > 0 && content_.size() != other.size() ) {
126  throw std::runtime_error(
127  "Cannot assign Dataframe column with different size."
128  );
129  }
130 
131  content_ = std::move( other.content_ );
132  return *this;
133  }
134 
135  // -------------------------------------------------------------------------
136  // Iterators
137  // -------------------------------------------------------------------------
138 
139  public:
140 
142  {
143  return content_.begin();
144  }
145 
147  {
148  return content_.cbegin();
149  }
150 
152  {
153  return content_.end();
154  }
155 
157  {
158  return content_.cend();
159  }
160 
162  {
163  return content_.cbegin();
164  }
165 
167  {
168  return content_.cend();
169  }
170 
171  // -------------------------------------------------------------------------
172  // Properties
173  // -------------------------------------------------------------------------
174 
176  {
177  return *df_;
178  }
179 
180  Dataframe const& dataframe() const
181  {
182  return *df_;
183  }
184 
185  size_type size() const
186  {
187  return content_.size();
188  }
189 
190  bool empty() const
191  {
192  return content_.empty();
193  }
194 
195  size_type index() const
196  {
197  return index_;
198  }
199 
200  std::string const& name() const
201  {
202  return df_->col_name( index_ );
203  }
204 
205  // -------------------------------------------------------------------------
206  // Element Access
207  // -------------------------------------------------------------------------
208 
210  {
211  return content_[ index ];
212  }
213 
215  {
216  return content_[ index ];
217  }
218 
219  reference operator[] ( std::string const& row_name )
220  {
221  return content_[ df_->row_index( row_name ) ];
222  }
223 
224  const_reference operator[] ( std::string const& row_name ) const
225  {
226  return content_[ df_->row_index( row_name ) ];
227  }
228 
230  {
231  return content_.at( index );
232  }
233 
235  {
236  return content_.at( index );
237  }
238 
239  reference at( std::string const& row_name )
240  {
241  return content_[ df_->row_index( row_name ) ];
242  }
243 
244  const_reference at( std::string const& row_name ) const
245  {
246  return content_[ df_->row_index( row_name ) ];
247  }
248 
249  // -------------------------------------------------------------------------
250  // Modifiers
251  // -------------------------------------------------------------------------
252 
258  self_type& operator = ( std::vector<value_type> const& vec )
259  {
260  if( vec.size() != content_.size() ) {
261  throw std::runtime_error(
262  "Cannot assign vector with different size to Dataframe column."
263  );
264  }
265 
266  content_ = vec;
267  return *this;
268  }
269 
270  // -------------------------------------------------------------------------
271  // Data Members
272  // -------------------------------------------------------------------------
273 
274  private:
275 
276  Dataframe* df_ = nullptr;
277  size_type index_;
278 
279  std::vector< value_type > content_;
280 
281 
282  };
283 
284  // ---------------------------------------------------------------------------------------------
285  // Member Types
286  // ---------------------------------------------------------------------------------------------
287 
290 
292  using const_reference = const value_type&;
293  using pointer = value_type*;
294  using const_pointer = const value_type*;
295 
296  using iterator = typename std::vector< value_type >::iterator;
297  using const_iterator = typename std::vector< value_type >::const_iterator;
298 
299  using size_type = size_t;
300 
301  // ---------------------------------------------------------------------------------------------
302  // Constructor and Rule of Five
303  // ---------------------------------------------------------------------------------------------
304 
305  friend class Column;
306 
307  Dataframe() = default;
308  ~Dataframe() = default;
309 
310  Dataframe( Dataframe const& ) = default;
311  Dataframe ( Dataframe&& ) = default;
312 
313  Dataframe& operator= ( Dataframe const& ) = default;
314  Dataframe& operator= ( Dataframe&& ) = default;
315 
316  // ---------------------------------------------------------------------------------------------
317  // Iterators
318  // ---------------------------------------------------------------------------------------------
319 
321  {
322  return columns_.begin();
323  }
324 
326  {
327  return columns_.cbegin();
328  }
329 
331  {
332  return columns_.end();
333  }
334 
336  {
337  return columns_.cend();
338  }
339 
341  {
342  return columns_.cbegin();
343  }
344 
346  {
347  return columns_.cend();
348  }
349 
350  // ---------------------------------------------------------------------------------------------
351  // Properties
352  // ---------------------------------------------------------------------------------------------
353 
354  size_type rows() const
355  {
356  return row_names_.size();
357  }
358 
359  size_type cols() const
360  {
361  return columns_.size();
362  }
363 
364  bool empty() const
365  {
366  return columns_.empty() && row_names_.empty();
367  }
368 
369  // ---------------------------------------------------------------------------------------------
370  // Column Access
371  // ---------------------------------------------------------------------------------------------
372 
374  {
375  return columns_.at( col_index );
376  }
377 
379  {
380  return columns_.at( col_index );
381  }
382 
383  reference operator[] ( std::string const& col_name )
384  {
385  return columns_[ col_index( col_name ) ];
386  }
387 
388  const_reference operator[] ( std::string const& col_name ) const
389  {
390  return columns_[ col_index( col_name ) ];
391  }
392 
394  {
395  return columns_.at( col_index );
396  }
397 
399  {
400  return columns_.at( col_index );
401  }
402 
403  reference at( std::string const& col_name )
404  {
405  return columns_[ col_index( col_name ) ];
406  }
407 
408  const_reference at( std::string const& col_name ) const
409  {
410  return columns_[ col_index( col_name ) ];
411  }
412 
413  // ---------------------------------------------------------------------------------------------
414  // Element Access
415  // ---------------------------------------------------------------------------------------------
416 
418  {
419  return at( col_index ).at( row_index );
420  }
421 
423  {
424  return at( col_index ).at( row_index );
425  }
426 
428  {
429  return at( col_index ).at( row_name );
430  }
431 
432  typename Column::const_reference operator () ( std::string const& row_name, size_type col_index ) const
433  {
434  return at( col_index ).at( row_name );
435  }
436 
438  {
439  return at( col_name ).at( row_index );
440  }
441 
442  typename Column::const_reference operator () ( size_type row_index, std::string const& col_name ) const
443  {
444  return at( col_name ).at( row_index );
445  }
446 
447  typename Column::reference operator () ( std::string const& row_name, std::string const& col_name )
448  {
449  return at( col_name ).at( row_name );
450  }
451 
452  typename Column::const_reference operator () ( std::string const& row_name, std::string const& col_name ) const
453  {
454  return at( col_name ).at( row_name );
455  }
456 
457  // ---------------------------------------------------------------------------------------------
458  // Indexing and Naming
459  // ---------------------------------------------------------------------------------------------
460 
461  size_t row_index( std::string const& row_name ) const
462  {
463  return row_lookup_.at( row_name );
464  }
465 
466  std::string const& row_name( size_type row_index ) const
467  {
468  return row_names_.at( row_index );
469  }
470 
471  self_type& row_name( size_type row_index, std::string const& value )
472  {
473  auto const& old = row_names_.at( row_index );
474  row_lookup_.erase( old );
475  row_lookup_[ value ] = row_index;
476  row_names_.at( row_index ) = value;
477 
478  return *this;
479  }
480 
481  std::vector<std::string> const& row_names() const
482  {
483  return row_names_;
484  }
485 
486  size_t col_index( std::string const& col_name ) const
487  {
488  return col_lookup_.at( col_name );
489  }
490 
491  std::string const& col_name( size_type col_index ) const
492  {
493  return col_names_.at( col_index );
494  }
495 
496  self_type& col_name( size_type col_index, std::string const& value )
497  {
498  auto const& old = col_names_.at( col_index );
499  col_lookup_.erase( old );
500  col_lookup_[ value ] = col_index;
501  col_names_.at( col_index ) = value;
502 
503  return *this;
504  }
505 
506  std::vector<std::string> col_names() const
507  {
508  return col_names_;
509  }
510 
511  // ---------------------------------------------------------------------------------------------
512  // Adding rows and cols
513  // ---------------------------------------------------------------------------------------------
514 
516  {
517  auto const index = columns_.size();
518  columns_.emplace_back( *this, index );
519  columns_.back().content_.resize( row_names_.size() );
520  col_names_.emplace_back();
521 
522  return *this;
523  }
524 
525  self_type& add_col( std::string const& name )
526  {
527  if( col_lookup_.count( name ) > 0 ) {
528  throw std::runtime_error( "Column with name " + name + " already exists in Dataframe." );
529  }
530 
531  auto const index = columns_.size();
532  columns_.emplace_back( *this, index );
533  columns_.back().content_.resize( row_names_.size() );
534  col_names_.emplace_back( name );
535  col_lookup_[ name ] = index;
536 
537  return *this;
538  }
539 
541  {
542  row_names_.emplace_back();
543 
544  for( auto& col : columns_ ) {
545  col.content_.emplace_back();
546  }
547 
548  return *this;
549  }
550 
551  self_type& add_row( std::string const& name )
552  {
553  // Add name.
554  if( row_lookup_.count( name ) > 0 ) {
555  throw std::runtime_error( "Row with name " + name + " already exists in Dataframe." );
556  }
557  row_names_.emplace_back( name );
558  row_lookup_[ name ] = row_names_.size() - 1;
559 
560  // Add content.
561  for( auto& col : columns_ ) {
562  col.content_.emplace_back();
563  }
564 
565  return *this;
566  }
567 
568  // ---------------------------------------------------------------------------------------------
569  // Removing rows and cols
570  // ---------------------------------------------------------------------------------------------
571 
573  {
574  columns_.clear();
575  row_names_.clear();
576  col_names_.clear();
577  row_lookup_.clear();
578  col_lookup_.clear();
579  return *this;
580  }
581 
583  {
584  for( auto& col : columns_ ) {
585  col.content_.clear();
586  }
587  row_names_.clear();
588  row_lookup_.clear();
589  return *this;
590  }
591 
593  {
594  columns_.clear();
595  col_names_.clear();
596  col_lookup_.clear();
597  return *this;
598  }
599 
601  {
602  assert( columns_.size() == col_names_.size() );
603  if( col_index >= columns_.size() ) {
604  throw std::runtime_error( "Invalid column index greater than number of columns." );
605  }
606 
607  // Remove elements.
608  auto const name = col_names_[ col_index ];
609  columns_.erase( columns_.begin() + col_index );
610  col_names_.erase( col_names_.begin() + col_index );
611  col_lookup_.erase( name );
612 
613  // Adjust remaining indices.
614  for( size_t i = 0; i < columns_.size(); ++i ) {
615  // --columns_[i].index_;
616 
617  // We do not need to adjust the indices, as the erase uses the assignment operators,
618  // which already make sure that the indices stay correct (simply by not changing them).
619  // So here, we simply control whether the indices of the columns are intact.
620  assert( columns_[i].index_ == i );
621  }
622 
623  // Adjust indices of all lookup table values that are greater than the removed index.
624  for( auto& le : col_lookup_ ) {
625  assert( le.second != col_index );
626  if( le.second > col_index ) {
627  --le.second;
628  }
629  }
630 
631  return *this;
632  }
633 
634  self_type& remove_col( std::string const& col_name )
635  {
636  auto const index = col_index( col_name );
637  assert( col_names_[ index ] == col_name );
638  remove_col( index );
639  return *this;
640  }
641 
643  {
644  if( row_index >= row_names_.size() ) {
645  throw std::runtime_error( "Invalid row index greater than number of rows." );
646  }
647 
648  // Remove elements.
649  for( auto& col : columns_ ) {
650  assert( col.content_.size() == row_names_.size() );
651  col.content_.erase( col.content_.begin() + row_index );
652  }
653  auto const name = row_names_[ row_index ];
654  row_names_.erase( row_names_.begin() + row_index );
655  row_lookup_.erase( name );
656 
657  // Adjust remaining indices.
658  for( auto& le : row_lookup_ ) {
659  assert( le.second != row_index );
660  if( le.second > row_index ) {
661  --le.second;
662  }
663  }
664 
665  return *this;
666  }
667 
668  self_type& remove_row( std::string const& row_name )
669  {
670  auto const index = row_index( row_name );
671  assert( row_names_[ index ] == row_name );
672  remove_row( index );
673  return *this;
674  }
675 
676  // ---------------------------------------------------------------------------------------------
677  // Data Members
678  // ---------------------------------------------------------------------------------------------
679 
680 private:
681 
682  std::vector< value_type > columns_;
683 
684  std::vector< std::string > row_names_;
685  std::vector< std::string > col_names_;
686 
687  std::unordered_map< std::string, size_t > row_lookup_;
688  std::unordered_map< std::string, size_t > col_lookup_;
689 
690 };
691 
692 } // namespace utils
693 } // namespace genesis
694 
695 #endif // include guard
self_type & remove_col(size_type col_index)
Definition: dataframe.hpp:600
self_type & clear_cols()
Definition: dataframe.hpp:592
self_type & remove_row(size_type row_index)
Definition: dataframe.hpp:642
reference at(size_type col_index)
Definition: dataframe.hpp:393
const_reference at(size_type col_index) const
Definition: dataframe.hpp:398
const_iterator begin() const
Definition: dataframe.hpp:325
self_type & col_name(size_type col_index, std::string const &value)
Definition: dataframe.hpp:496
reference at(std::string const &row_name)
Definition: dataframe.hpp:239
Column::reference operator()(size_type row_index, size_type col_index)
Definition: dataframe.hpp:417
typename std::vector< value_type >::iterator iterator
Definition: dataframe.hpp:84
const_iterator end() const
Definition: dataframe.hpp:335
size_t col_index(std::string const &col_name) const
Definition: dataframe.hpp:486
reference operator[](size_type col_index)
Definition: dataframe.hpp:373
typename std::vector< value_type >::const_iterator const_iterator
Definition: dataframe.hpp:85
typename std::vector< value_type >::iterator iterator
Definition: dataframe.hpp:296
Column & operator=(Column const &other)
Definition: dataframe.hpp:105
const_iterator end() const
Definition: dataframe.hpp:156
const_reference at(size_type index) const
Definition: dataframe.hpp:234
size_t row_index(std::string const &row_name) const
Definition: dataframe.hpp:461
const value_type & const_reference
Definition: dataframe.hpp:80
Provides some valuable additions to STD.
const_reference at(std::string const &col_name) const
Definition: dataframe.hpp:408
reference at(size_type index)
Definition: dataframe.hpp:229
self_type & add_col(std::string const &name)
Definition: dataframe.hpp:525
const_iterator begin() const
Definition: dataframe.hpp:146
std::string const & row_name(size_type row_index) const
Definition: dataframe.hpp:466
Dataframe const & dataframe() const
Definition: dataframe.hpp:180
self_type & remove_row(std::string const &row_name)
Definition: dataframe.hpp:668
size_type rows() const
Definition: dataframe.hpp:354
std::vector< std::string > col_names() const
Definition: dataframe.hpp:506
const value_type * const_pointer
Definition: dataframe.hpp:82
Dataframe & operator=(Dataframe const &)=default
reference operator[](size_type index)
Definition: dataframe.hpp:209
Column(Dataframe &df, size_type index)
Definition: dataframe.hpp:95
std::string const & name() const
Definition: dataframe.hpp:200
const_iterator cbegin()
Definition: dataframe.hpp:340
self_type & add_row(std::string const &name)
Definition: dataframe.hpp:551
size_type cols() const
Definition: dataframe.hpp:359
const_reference at(std::string const &row_name) const
Definition: dataframe.hpp:244
self_type & clear_rows()
Definition: dataframe.hpp:582
std::string const & col_name(size_type col_index) const
Definition: dataframe.hpp:491
const_iterator cend()
Definition: dataframe.hpp:345
reference at(std::string const &col_name)
Definition: dataframe.hpp:403
self_type & remove_col(std::string const &col_name)
Definition: dataframe.hpp:634
typename std::vector< value_type >::const_iterator const_iterator
Definition: dataframe.hpp:297
std::vector< std::string > const & row_names() const
Definition: dataframe.hpp:481
self_type & row_name(size_type row_index, std::string const &value)
Definition: dataframe.hpp:471