A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utils/containers/dataframe/reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
2 #define GENESIS_UTILS_CONTAINERS_DATAFRAME_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
38 
39 #include <functional>
40 #include <stdexcept>
41 #include <sstream>
42 #include <string>
43 #include <vector>
44 
45 namespace genesis {
46 namespace utils {
47 
48 // =================================================================================================
49 // DataframeReader
50 // =================================================================================================
51 
52 template <typename T>
54 {
55 public:
56 
57  // -------------------------------------------------------------
58  // Constructors and Rule of Five
59  // -------------------------------------------------------------
60 
61  DataframeReader( char separator_char = ',' )
62  {
63  reader_.separator_chars( std::string( 1, separator_char ));
64  }
65 
66  DataframeReader( CsvReader const& reader )
67  : reader_( reader )
68  {}
69 
70  ~DataframeReader() = default;
71 
72  DataframeReader(DataframeReader const&) = default;
73  DataframeReader(DataframeReader&&) = default;
74 
75  DataframeReader& operator= (DataframeReader const&) = default;
77 
78  // -------------------------------------------------------------
79  // Reading
80  // -------------------------------------------------------------
81 
82  Dataframe<T> from_stream( std::istream& is ) const
83  {
84  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( is ));
85  return parse_( it );
86  }
87 
88  Dataframe<T> from_file ( std::string const& fn ) const
89  {
90  utils::InputStream it( utils::make_unique< utils::FileInputSource >( fn ));
91  return parse_( it );
92  }
93 
94  Dataframe<T> from_string( std::string const& fs ) const
95  {
96  utils::InputStream it( utils::make_unique< utils::StringInputSource >( fs ));
97  return parse_( it );
98  }
99 
100  // -------------------------------------------------------------
101  // Properties
102  // -------------------------------------------------------------
103 
104  bool names_from_first_row() const
105  {
106  return names_from_first_row_;
107  }
108 
109  bool names_from_first_col() const
110  {
111  return names_from_first_col_;
112  }
113 
115  {
116  names_from_first_row_ = value;
117  return *this;
118  }
119 
121  {
122  names_from_first_col_ = value;
123  return *this;
124  }
125 
127  {
128  return reader_;
129  }
130 
131  CsvReader const& csv_reader() const
132  {
133  return reader_;
134  }
135 
136  DataframeReader& parse_value_functor( std::function<T( std::string const& )> functor )
137  {
138  parse_value_ = functor;
139  }
140 
141  // -------------------------------------------------------------
142  // Internal Functions
143  // -------------------------------------------------------------
144 
145 private:
146 
147  Dataframe<T> parse_( utils::InputStream& input_stream ) const
148  {
149  Dataframe<T> result;
150  size_t const offset = ( names_from_first_col_ ? 1 : 0 );
151 
152  // Early stop.
153  if( ! input_stream ) {
154  return result;
155  }
156 
157  // Read column names.
158  if( names_from_first_row_ ) {
159  auto const col_names = reader_.parse_line( input_stream );
160  size_t const start = offset;
161  for( size_t i = start; i < col_names.size(); ++i ) {
162  result.add_col( col_names[i] );
163  }
164  }
165 
166  // Read lines of data.
167  while( input_stream ) {
168  auto const line = reader_.parse_line( input_stream );
169 
170  // Need to have a least one content element.
171  if(( line.size() == 0 ) || ( names_from_first_col_ && line.size() == 1 )) {
172  throw std::runtime_error( "Cannot read Dataframe with empty lines." );
173  }
174  assert( line.size() > offset );
175 
176  // Add a row for the line. Use row name if wanted.
177  if( names_from_first_col_ ) {
178  result.add_row( line[0] );
179  } else {
180  result.add_row();
181  }
182 
183  // If there was no column names, make columns.
184  if( result.cols() == 0 ) {
185  // This can only happen in the first line, and if no col names were read.
186  assert( result.rows() == 1 );
187  assert( ! names_from_first_row_ );
188 
189  // Add unnamed cols.
190  for( size_t i = offset; i < line.size(); ++i ) {
191  result.add_col();
192  }
193  }
194 
195  // Check if the line has the correct size.
196  if( line.size() != offset + result.cols() ) {
197  throw std::runtime_error( "Dataframe input has different line lengths." );
198  }
199 
200  // Parse and transfer the data. User specified parser or default one.
201  auto const row_idx = result.rows() - 1;
202  if( parse_value_ ) {
203  for( size_t i = 0; i < result.cols(); ++i ) {
204  result( row_idx, i ) = parse_value_( line[ offset + i ] );
205  }
206  } else {
207  for( size_t i = 0; i < result.cols(); ++i ) {
208  result( row_idx, i ) = parse_value_stringstream_( line[ offset + i ] );
209  }
210  }
211  }
212 
213  return result;
214  }
215 
216  inline T parse_value_stringstream_( std::string const& cell ) const
217  {
218  std::stringstream ss( cell );
219  T value;
220  ss >> value;
221  return value;
222  }
223 
224  // -------------------------------------------------------------
225  // Data Members
226  // -------------------------------------------------------------
227 
228 private:
229 
230  bool names_from_first_row_ = true;
231  bool names_from_first_col_ = true;
232 
233  CsvReader reader_;
234 
235  std::function<T( std::string const& )> parse_value_;
236 
237 };
238 
239 } // namespace utils
240 } // namespace genesis
241 
242 #endif // include guard
void offset(Histogram &h, double value)
Definition: operations.cpp:47
DataframeReader & parse_value_functor(std::function< T(std::string const &)> functor)
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Provides some valuable additions to STD.
Dataframe< T > from_file(std::string const &fn) const
size_type rows() const
Definition: dataframe.hpp:330
Dataframe< T > from_string(std::string const &fs) const
Dataframe< T > from_stream(std::istream &is) const
size_type cols() const
Definition: dataframe.hpp:335
DataframeReader & operator=(DataframeReader const &)=default
Read Comma Separated Values (CSV) data and other delimiter-separated formats.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...