A toolkit for working with phylogenetic data.
v0.24.0
utils/containers/dataframe/reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
2 #define GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
41 
42 #include <functional>
43 #include <iostream>
44 #include <sstream>
45 #include <stdexcept>
46 #include <string>
47 #include <type_traits>
48 #include <vector>
49 
50 namespace genesis {
51 namespace utils {
52 
53 // =================================================================================================
54 // DataframeReader
55 // =================================================================================================
56 
57 template<typename T = std::string>
59 {
60 public:
61 
62  // -------------------------------------------------------------
63  // Constructors and Rule of Five
64  // -------------------------------------------------------------
65 
66  explicit DataframeReader( char separator_char = ',' )
67  {
68  reader_.separator_chars( std::string( 1, separator_char ));
69  }
70 
71  explicit DataframeReader( CsvReader const& reader )
72  : reader_( reader )
73  {}
74 
75  ~DataframeReader() = default;
76 
77  DataframeReader(DataframeReader const&) = default;
78  DataframeReader(DataframeReader&&) = default;
79 
80  DataframeReader& operator= (DataframeReader const&) = default;
82 
83  // -------------------------------------------------------------
84  // Reading
85  // -------------------------------------------------------------
86 
88  std::shared_ptr<BaseInputSource> source
89  ) const {
90  utils::InputStream is( source );
91  return parse_( is );
92  }
93 
94  // -------------------------------------------------------------
95  // Properties
96  // -------------------------------------------------------------
97 
99  {
100  return col_names_from_first_row_;
101  }
102 
104  {
105  return row_names_from_first_col_;
106  }
107 
109  {
110  col_names_from_first_row_ = value;
111  return *this;
112  }
113 
115  {
116  row_names_from_first_col_ = value;
117  return *this;
118  }
119 
121  {
122  return reader_;
123  }
124 
125  CsvReader const& csv_reader() const
126  {
127  return reader_;
128  }
129 
130  bool trim_whitespace() const
131  {
132  return trim_whitespace_;
133  }
134 
136  {
137  trim_whitespace_ = value;
138  return *this;
139  }
140 
141  DataframeReader& parse_value_functor( std::function<T( std::string const& )> functor )
142  {
143  parse_value_ = functor;
144  return *this;
145  }
146 
147  // -------------------------------------------------------------
148  // Internal Functions
149  // -------------------------------------------------------------
150 
151 private:
152 
153  Dataframe parse_(
154  utils::InputStream& input_stream
155  ) const {
156  Dataframe result;
157  size_t const offset = ( row_names_from_first_col_ ? 1 : 0 );
158  size_t line_cnt = 0;
159 
160  // Early stop.
161  if( ! input_stream ) {
162  return result;
163  }
164 
165  // Read column names.
166  if( col_names_from_first_row_ ) {
167  auto const col_names = reader_.parse_line( input_stream );
168  ++line_cnt;
169 
170  size_t const start = offset;
171  for( size_t i = start; i < col_names.size(); ++i ) {
172  result.add_col<T>( col_names[i] );
173  }
174  }
175 
176  // Read lines of data.
177  while( input_stream ) {
178  auto const line = reader_.parse_line( input_stream );
179  ++line_cnt;
180 
181  // Need to have a least one content element.
182  if(( line.size() == 0 ) || ( row_names_from_first_col_ && line.size() == 1 )) {
183  throw std::runtime_error(
184  "Cannot read Dataframe with lines that do not contain any content (line " +
185  std::to_string( line_cnt ) + "). Maybe the separator char is wrong."
186  );
187  }
188  assert( line.size() > offset );
189 
190  // Add a row for the line. Use row name if wanted.
191  if( row_names_from_first_col_ ) {
192  result.add_row( line[0] );
193  } else {
194  result.add_unnamed_row();
195  }
196 
197  // If there was no column names, make columns.
198  if( result.cols() == 0 ) {
199  // This can only happen in the first line, and if no col names were read.
200  assert( result.rows() == 1 );
201  assert( ! col_names_from_first_row_ );
202 
203  // Add unnamed cols.
204  for( size_t i = offset; i < line.size(); ++i ) {
205  result.add_unnamed_col<T>();
206  }
207  assert( line.size() == offset + result.cols() );
208  }
209 
210  // Check if the line has the correct size.
211  if( line.size() != offset + result.cols() ) {
212  throw std::runtime_error(
213  "Dataframe input has different line lengths (line " +
214  std::to_string( line_cnt ) + ")."
215  );
216  }
217 
218  // Parse and transfer the data. User specified parser or default one.
219  auto const row_idx = result.rows() - 1;
220  if( parse_value_ ) {
221  for( size_t i = 0; i < result.cols(); ++i ) {
222  auto& col = dynamic_cast<Dataframe::Column<T>&>(result[i]);
223  col[row_idx] = parse_value_(
224  trim_whitespace_ ? trim(line[ offset + i ]) : line[ offset + i ]
225  );
226 
227  // Some old ideas, for reference.
228  // result( row_idx, i ) = parse_value_( line[ offset + i ] );
229  // result[i][row_idx] = parse_value_( line[ offset + i ] );
230  }
231  } else {
232  for( size_t i = 0; i < result.cols(); ++i ) {
233  auto& col = dynamic_cast<Dataframe::Column<T>&>(result[i]);
234 
235  // Here, we assume that the value we are reading is the only thing in the str.
236  // The Csv Reader offers to trim chars (eg whitespace), but does not do so by default,
237  // in order to follow the csv specification, which states that any whitespace is considered
238  // to be part of the field. So, we treat this specification with respect, and also do not
239  // trim it here by default. That means, we fail whenever there is whitespace.
240  // The option trim_whitespace() is then used to allow whitespace around each cell.
241 
242  // We need to catch exceptions, in order to give more useful error messages
243  // here. In the normal non-throw case, this does not cost us any speed,
244  // so this is okay.
245  try {
246  col[row_idx] = convert_from_string<T>(
247  trim_whitespace_ ? trim(line[ offset + i ]) : line[ offset + i ]
248  );
249  } catch(...) {
250  throw std::runtime_error(
251  "Cannot parse value \"" + line[ offset + i ] + "\" into Dataframe. "
252  "Either the input data does not represent values of the specified data "
253  "type, or the input data table contains whitespace around the fields. "
254  "If the latter, allow to trim the respective whitespace chars by "
255  "setting the CsvReader::trim_chars() option accordingly."
256  );
257  }
258 
259  // Some old ideas, for reference.
260  // result( row_idx, i ) = parse_value_default_( line[ offset + i ] );
261  // result[i][row_idx] = parse_value_default_( line[ offset + i ] );
262  // col[row_idx] = parse_value_default_<T>( line[ offset + i ] );
263  }
264  }
265  }
266 
267  assert( result.rows() == line_cnt - ( col_names_from_first_row_ ? 1 : 0 ));
268  return result;
269  }
270 
271  // -------------------------------------------------------------
272  // Data Members
273  // -------------------------------------------------------------
274 
275 private:
276 
277  bool col_names_from_first_row_ = true;
278  bool row_names_from_first_col_ = true;
279  bool trim_whitespace_ = false;
280 
281  CsvReader reader_;
282 
283  std::function<T( std::string const& )> parse_value_;
284 
285 };
286 
287 } // namespace utils
288 } // namespace genesis
289 
290 #endif // include guard
void offset(Histogram &h, double value)
Definition: operations.cpp:47
DataframeReader & parse_value_functor(std::function< T(std::string const &)> functor)
Column< T > & add_col(std::string const &name)
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
Provides some valuable additions to STD.
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:394
DataframeReader & row_names_from_first_col(bool value)
Provides some commonly used string utility functions.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
self_type & add_row(std::string const &name)
DataframeReader & operator=(DataframeReader const &)=default
Read Comma/Character Separated Values (CSV) data and other delimiter-separated formats.
Dataframe read(std::shared_ptr< BaseInputSource > source) const
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
DataframeReader & col_names_from_first_row(bool value)
Stream interface for reading data from an InputSource, that keeps track of line and column counters...