A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utils/containers/dataframe/reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
2 #define GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
38 
#include <functional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
44 
45 namespace genesis {
46 namespace utils {
47 
48 // =================================================================================================
49 // DataframeReader
50 // =================================================================================================
51 
52 template <typename T>
54 {
55 public:
56 
57  // -------------------------------------------------------------
58  // Constructors and Rule of Five
59  // -------------------------------------------------------------
60 
61  DataframeReader( char separator_char = ',' )
62  {
63  reader_.separator_chars( std::string( 1, separator_char ));
64  }
65 
66  DataframeReader( CsvReader const& reader )
67  : reader_( reader )
68  {}
69 
70  ~DataframeReader() = default;
71 
72  DataframeReader(DataframeReader const&) = default;
73  DataframeReader(DataframeReader&&) = default;
74 
75  DataframeReader& operator= (DataframeReader const&) = default;
77 
78  // -------------------------------------------------------------
79  // Reading
80  // -------------------------------------------------------------
81 
82  Dataframe<T> from_stream( std::istream& is ) const
83  {
84  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( is ));
85  return parse_( it );
86  }
87 
88  Dataframe<T> from_file ( std::string const& fn ) const
89  {
90  utils::InputStream it( utils::make_unique< utils::FileInputSource >( fn ));
91  return parse_( it );
92  }
93 
94  Dataframe<T> from_string( std::string const& fs ) const
95  {
96  utils::InputStream it( utils::make_unique< utils::StringInputSource >( fs ));
97  return parse_( it );
98  }
99 
100  // -------------------------------------------------------------
101  // Properties
102  // -------------------------------------------------------------
103 
104  bool names_from_first_row() const
105  {
106  return names_from_first_row_;
107  }
108 
109  bool names_from_first_col() const
110  {
111  return names_from_first_col_;
112  }
113 
115  {
116  names_from_first_row_ = value;
117  return *this;
118  }
119 
121  {
122  names_from_first_col_ = value;
123  return *this;
124  }
125 
127  {
128  return reader_;
129  }
130 
131  CsvReader const& csv_reader() const
132  {
133  return reader_;
134  }
135 
136  DataframeReader& parse_value_functor( std::function<T( std::string const& )> functor )
137  {
138  parse_value_ = functor;
139  return *this;
140  }
141 
142  // -------------------------------------------------------------
143  // Internal Functions
144  // -------------------------------------------------------------
145 
146 private:
147 
148  Dataframe<T> parse_( utils::InputStream& input_stream ) const
149  {
150  Dataframe<T> result;
151  size_t const offset = ( names_from_first_col_ ? 1 : 0 );
152 
153  // Early stop.
154  if( ! input_stream ) {
155  return result;
156  }
157 
158  // Read column names.
159  if( names_from_first_row_ ) {
160  auto const col_names = reader_.parse_line( input_stream );
161  size_t const start = offset;
162  for( size_t i = start; i < col_names.size(); ++i ) {
163  result.add_col( col_names[i] );
164  }
165  }
166 
167  // Read lines of data.
168  while( input_stream ) {
169  auto const line = reader_.parse_line( input_stream );
170 
171  // Need to have a least one content element.
172  if(( line.size() == 0 ) || ( names_from_first_col_ && line.size() == 1 )) {
173  throw std::runtime_error( "Cannot read Dataframe with empty lines." );
174  }
175  assert( line.size() > offset );
176 
177  // Add a row for the line. Use row name if wanted.
178  if( names_from_first_col_ ) {
179  result.add_row( line[0] );
180  } else {
181  result.add_row();
182  }
183 
184  // If there was no column names, make columns.
185  if( result.cols() == 0 ) {
186  // This can only happen in the first line, and if no col names were read.
187  assert( result.rows() == 1 );
188  assert( ! names_from_first_row_ );
189 
190  // Add unnamed cols.
191  for( size_t i = offset; i < line.size(); ++i ) {
192  result.add_col();
193  }
194  }
195 
196  // Check if the line has the correct size.
197  if( line.size() != offset + result.cols() ) {
198  throw std::runtime_error( "Dataframe input has different line lengths." );
199  }
200 
201  // Parse and transfer the data. User specified parser or default one.
202  auto const row_idx = result.rows() - 1;
203  if( parse_value_ ) {
204  for( size_t i = 0; i < result.cols(); ++i ) {
205  result( row_idx, i ) = parse_value_( line[ offset + i ] );
206  }
207  } else {
208  for( size_t i = 0; i < result.cols(); ++i ) {
209  result( row_idx, i ) = parse_value_stringstream_( line[ offset + i ] );
210  }
211  }
212  }
213 
214  return result;
215  }
216 
217  inline T parse_value_stringstream_( std::string const& cell ) const
218  {
219  std::stringstream ss( cell );
220  T value;
221  ss >> value;
222  return value;
223  }
224 
225  // -------------------------------------------------------------
226  // Data Members
227  // -------------------------------------------------------------
228 
229 private:
230 
231  bool names_from_first_row_ = true;
232  bool names_from_first_col_ = true;
233 
234  CsvReader reader_;
235 
236  std::function<T( std::string const& )> parse_value_;
237 
238 };
239 
240 } // namespace utils
241 } // namespace genesis
242 
243 #endif // include guard
void offset(Histogram &h, double value)
Definition: operations.cpp:47
DataframeReader & parse_value_functor(std::function< T(std::string const &)> functor)
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Provides some valuable additions to STD.
Dataframe< T > from_file(std::string const &fn) const
size_type rows() const
Definition: dataframe.hpp:354
Dataframe< T > from_string(std::string const &fs) const
Dataframe< T > from_stream(std::istream &is) const
size_type cols() const
Definition: dataframe.hpp:359
DataframeReader & operator=(DataframeReader const &)=default
Read Comma Separated Values (CSV) data and other delimiter-separated formats.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...