A library for working with phylogenetic and population genetic data.
v0.32.0
utils/containers/dataframe/reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
2 #define GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2023 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
41 
42 #include <functional>
43 #include <iostream>
44 #include <sstream>
45 #include <stdexcept>
46 #include <string>
47 #include <type_traits>
48 #include <vector>
49 
50 namespace genesis {
51 namespace utils {
52 
53 // =================================================================================================
54 // DataframeReader
55 // =================================================================================================
56 
57 template<typename T = std::string>
59 {
60 public:
61 
62  // -------------------------------------------------------------
63  // Constructors and Rule of Five
64  // -------------------------------------------------------------
65 
66  explicit DataframeReader( char separator_char = ',' )
67  {
68  reader_.separator_chars( std::string( 1, separator_char ));
69  }
70 
71  explicit DataframeReader( CsvReader const& reader )
72  : reader_( reader )
73  {}
74 
75  ~DataframeReader() = default;
76 
77  DataframeReader(DataframeReader const&) = default;
78  DataframeReader(DataframeReader&&) = default;
79 
80  DataframeReader& operator= (DataframeReader const&) = default;
82 
83  // -------------------------------------------------------------
84  // Reading
85  // -------------------------------------------------------------
86 
88  std::shared_ptr<BaseInputSource> source
89  ) const {
90  utils::InputStream is( source );
91  return parse_( is );
92  }
93 
94  // -------------------------------------------------------------
95  // Properties
96  // -------------------------------------------------------------
97 
99  {
100  return col_names_from_first_row_;
101  }
102 
104  {
105  return row_names_from_first_col_;
106  }
107 
109  {
110  col_names_from_first_row_ = value;
111  return *this;
112  }
113 
115  {
116  row_names_from_first_col_ = value;
117  return *this;
118  }
119 
121  {
122  return reader_;
123  }
124 
125  CsvReader const& csv_reader() const
126  {
127  return reader_;
128  }
129 
130  bool trim_whitespace() const
131  {
132  return trim_whitespace_;
133  }
134 
136  {
137  trim_whitespace_ = value;
138  return *this;
139  }
140 
141  DataframeReader& parse_value_functor( std::function<T( std::string const& )> functor )
142  {
143  parse_value_ = functor;
144  return *this;
145  }
146 
147  // -------------------------------------------------------------
148  // Internal Functions
149  // -------------------------------------------------------------
150 
151 private:
152 
153  Dataframe parse_(
154  utils::InputStream& input_stream
155  ) const {
156  Dataframe result;
157  size_t const offset = ( row_names_from_first_col_ ? 1 : 0 );
158  size_t line_cnt = 0;
159 
160  // Early stop.
161  if( ! input_stream ) {
162  return result;
163  }
164 
165  // Read column names.
166  if( col_names_from_first_row_ ) {
167  auto const col_names = reader_.parse_line( input_stream );
168  ++line_cnt;
169 
170  size_t const start = offset;
171  for( size_t i = start; i < col_names.size(); ++i ) {
172  result.add_col<T>( col_names[i] );
173  }
174  }
175 
176  // Read lines of data.
177  while( input_stream ) {
178  auto const line = reader_.parse_line( input_stream );
179  ++line_cnt;
180 
181  // Need to have a least one content element.
182  if(( line.size() == 0 ) || ( row_names_from_first_col_ && line.size() == 1 )) {
183  throw std::runtime_error(
184  "Cannot read Dataframe with lines that do not contain any content (line " +
185  std::to_string( line_cnt ) + "). Maybe the separator char is wrong."
186  );
187  }
188  assert( line.size() > offset );
189 
190  // Add a row for the line. Use row name if wanted.
191  if( row_names_from_first_col_ ) {
192  result.add_row( line[0] );
193  } else {
194  result.add_unnamed_row();
195  }
196 
197  // If there was no column names, make columns.
198  if( result.cols() == 0 ) {
199  // This can only happen in the first line, and if no col names were read.
200  assert( result.rows() == 1 );
201  assert( ! col_names_from_first_row_ );
202 
203  // Add unnamed cols.
204  for( size_t i = offset; i < line.size(); ++i ) {
205  result.add_unnamed_col<T>();
206  }
207  assert( line.size() == offset + result.cols() );
208  }
209 
210  // Check if the line has the correct size.
211  if( line.size() != offset + result.cols() ) {
212  throw std::runtime_error(
213  "Dataframe input has different line lengths (line " +
214  std::to_string( line_cnt ) + ")."
215  );
216  }
217 
218  // Parse and transfer the data. User specified parser or default one.
219  auto const row_idx = result.rows() - 1;
220  if( parse_value_ ) {
221  for( size_t i = 0; i < result.cols(); ++i ) {
222  auto& col = dynamic_cast<Dataframe::Column<T>&>(result[i]);
223  col[row_idx] = parse_value_(
224  trim_whitespace_ ? trim(line[ offset + i ]) : line[ offset + i ]
225  );
226 
227  // Some old ideas, for reference.
228  // result( row_idx, i ) = parse_value_( line[ offset + i ] );
229  // result[i][row_idx] = parse_value_( line[ offset + i ] );
230  }
231  } else {
232  for( size_t i = 0; i < result.cols(); ++i ) {
233  auto& col = dynamic_cast<Dataframe::Column<T>&>(result[i]);
234 
235  // Here, we assume that the value we are reading is the only thing in the str.
236  // The Csv Reader offers to trim chars (eg whitespace), but does not do so by default,
237  // in order to follow the csv specification, which states that any whitespace is considered
238  // to be part of the field. So, we treat this specification with respect, and also do not
239  // trim it here by default. That means, we fail whenever there is whitespace.
240  // The option trim_whitespace() is then used to allow whitespace around each cell.
241 
242  // We need to catch exceptions, in order to give more useful error messages
243  // here. In the normal non-throw case, this does not cost us any speed,
244  // so this is okay.
245  try {
246  col[row_idx] = convert_from_string<T>(
247  trim_whitespace_ ? trim(line[ offset + i ]) : line[ offset + i ]
248  );
249  } catch(...) {
250  throw std::runtime_error(
251  "In " + input_stream.source_name() + " line " +
252  std::to_string( line_cnt ) + ": "
253  "Cannot parse value \"" + line[ offset + i ] + "\" into Dataframe. "
254  "Either the input data does not represent values of the specified data "
255  "type, or the input data table contains whitespace around the fields. "
256  "If the latter, allow to trim the respective whitespace chars by "
257  "setting the CsvReader::trim_chars() option accordingly."
258  );
259  }
260 
261  // Some old ideas, for reference.
262  // result( row_idx, i ) = parse_value_default_( line[ offset + i ] );
263  // result[i][row_idx] = parse_value_default_( line[ offset + i ] );
264  // col[row_idx] = parse_value_default_<T>( line[ offset + i ] );
265  }
266  }
267  }
268 
269  assert( result.rows() == line_cnt - ( col_names_from_first_row_ ? 1 : 0 ));
270  return result;
271  }
272 
273  // -------------------------------------------------------------
274  // Data Members
275  // -------------------------------------------------------------
276 
277 private:
278 
279  bool col_names_from_first_row_ = true;
280  bool row_names_from_first_col_ = true;
281  bool trim_whitespace_ = false;
282 
283  CsvReader reader_;
284 
285  std::function<T( std::string const& )> parse_value_;
286 
287 };
288 
289 } // namespace utils
290 } // namespace genesis
291 
292 #endif // include guard
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::utils::DataframeReader::trim_whitespace
DataframeReader & trim_whitespace(bool value)
Definition: utils/containers/dataframe/reader.hpp:135
genesis::utils::CsvReader::parse_line
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Definition: utils/formats/csv/reader.cpp:160
genesis::utils::CsvReader::separator_chars
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
Definition: utils/formats/csv/reader.hpp:259
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:478
genesis::utils::DataframeReader::csv_reader
CsvReader const & csv_reader() const
Definition: utils/containers/dataframe/reader.hpp:125
genesis::utils::DataframeReader::col_names_from_first_row
bool col_names_from_first_row() const
Definition: utils/containers/dataframe/reader.hpp:98
genesis::utils::DataframeReader::trim_whitespace
bool trim_whitespace() const
Definition: utils/containers/dataframe/reader.hpp:130
genesis::utils::trim
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces (or any other delimiters).
Definition: string.cpp:827
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
std.hpp
Provides some valuable additions to STD.
genesis::utils::Dataframe::cols
size_type cols() const
Definition: containers/dataframe.hpp:617
genesis::utils::DataframeReader::csv_reader
CsvReader & csv_reader()
Definition: utils/containers/dataframe/reader.hpp:120
genesis::utils::DataframeReader::DataframeReader
DataframeReader(CsvReader const &reader)
Definition: utils/containers/dataframe/reader.hpp:71
input_source.hpp
genesis::utils::Dataframe::add_row
self_type & add_row(std::string const &name)
Definition: containers/dataframe.hpp:878
reader.hpp
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::DataframeReader::operator=
DataframeReader & operator=(DataframeReader const &)=default
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::utils::DataframeReader::parse_value_functor
DataframeReader & parse_value_functor(std::function< T(std::string const &)> functor)
Definition: utils/containers/dataframe/reader.hpp:141
genesis::utils::DataframeReader::DataframeReader
DataframeReader(char separator_char=',')
Definition: utils/containers/dataframe/reader.hpp:66
genesis::utils::DataframeReader::~DataframeReader
~DataframeReader()=default
genesis::utils::DataframeReader::row_names_from_first_col
DataframeReader & row_names_from_first_col(bool value)
Definition: utils/containers/dataframe/reader.hpp:114
genesis::utils::Dataframe::add_unnamed_col
Column< T > & add_unnamed_col()
Definition: containers/dataframe.hpp:763
genesis::utils::DataframeReader
Definition: utils/containers/dataframe/reader.hpp:58
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::DataframeReader::col_names_from_first_row
DataframeReader & col_names_from_first_row(bool value)
Definition: utils/containers/dataframe/reader.hpp:108
convert.hpp
genesis::utils::DataframeReader::row_names_from_first_col
bool row_names_from_first_col() const
Definition: utils/containers/dataframe/reader.hpp:103
genesis::utils::CsvReader
Read Comma/Character Separated Values (CSV) data and other delimiter-separated formats.
Definition: utils/formats/csv/reader.hpp:70
genesis::utils::Dataframe::add_unnamed_row
self_type & add_unnamed_row()
Definition: containers/dataframe.hpp:866
genesis::utils::Dataframe
Definition: containers/dataframe.hpp:59
dataframe.hpp
genesis::utils::Dataframe::rows
size_type rows() const
Definition: containers/dataframe.hpp:612
genesis::utils::DataframeReader::read
Dataframe read(std::shared_ptr< BaseInputSource > source) const
Definition: utils/containers/dataframe/reader.hpp:87
genesis::utils::Dataframe::add_col
Column< T > & add_col(std::string const &name)
Definition: containers/dataframe.hpp:809