A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utils/formats/csv/reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include <algorithm>
34 #include <assert.h>
35 #include <cctype>
36 #include <fstream>
37 #include <sstream>
38 #include <stdexcept>
39 
46 
47 namespace genesis {
48 namespace utils {
49 
50 // =================================================================================================
51 // Reading
52 // =================================================================================================
53 
57 CsvReader::table CsvReader::from_stream( std::istream& is ) const
58 {
59  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( is ));
60  return parse_document( it );
61 }
62 
66 CsvReader::table CsvReader::from_file( std::string const& fn ) const
67 {
68  utils::InputStream it( utils::make_unique< utils::FileInputSource >( fn ));
69  return parse_document( it );
70 }
71 
75 CsvReader::table CsvReader::from_string( std::string const& fs ) const
76 {
77  utils::InputStream it( utils::make_unique< utils::StringInputSource >( fs ));
78  return parse_document( it );
79 }
80 
81 // =================================================================================================
82 // Parsing
83 // =================================================================================================
84 
89  utils::InputStream& input_stream
90 ) const {
91  auto& it = input_stream;
92  table result;
93 
94  while( it ) {
95  // Parse the next line and push it if it has content.
96  // (If the file ends on empty lines, this might not be the case).
97  auto line = parse_line( it );
98  if( line.size() > 0 ) {
99  result.push_back( line );
100  }
101  }
102 
103  return result;
104 }
105 
124 std::string CsvReader::parse_field( utils::InputStream& input_stream ) const
125 {
126  // Init. We use a buffer in order to keep memory reallocations to a minimum.
127  // See see member variable in the class declaration for more information.
128  auto& it = input_stream;
129  buffer_ = "";
130 
131  // Trim the start of the field by skipping chars if needed.
132  // (The end needs to be actually trimmed, as we don't know when it comes.)
133  skip_while( it, [&] ( char c ) {
134  return trim_chars_.find( c ) != std::string::npos;
135  });
136 
137  // Read as long as there is input. We will break when finding a new line later.
138  while( it ) {
139 
140  // Treat escape sequences if needed.
141  if( use_escapes_ && *it == '\\' ) {
142 
143  // Skip the backslash.
144  ++it;
145 
146  // We found an escaping backslash. This cannot be the end of the stream.
147  if( !it ) {
148  throw std::runtime_error(
149  "Unexpected end of string at " + it.at() + ". Expecting escape sequence."
150  );
151  }
152 
153  // De-escape.
154  buffer_ += deescape( *it );
155 
156  // Next char. We skip the rest of this loop, as we already treated the current char.
157  ++it;
158  continue;
159  }
160 
161  // Finish reading at the end of the line or when one of the separator chars is found.
162  if( *it == '\n' || separator_chars_.find( *it ) != std::string::npos ) {
163  break;
164  }
165 
166  // Parse quoted strings if needed.
167  // We add them to the result, even when they occur in the middle of a field.
168  if( quotation_chars_.find( *it ) != std::string::npos ) {
169 
170  // If the parsing results in an empty string, this means that there were two
171  // consecutive quotation marks. So, if also use_twin_quotes is activated, we need
172  // to add one quotation mark to the result.
173  // In all other cases (i.e., there was content in the quoted string, or we do not
174  // use twin quotes), add this content to the result.
175  char qm = *it;
176  auto qs = parse_quoted_string( it, use_escapes_, use_twin_quotes_, false );
177  if( qs == "" && use_twin_quotes_ ) {
178  buffer_ += qm;
179  } else {
180  buffer_ += qs;
181  }
182  continue;
183  }
184 
185  // In any other case, simply read the char.
186  buffer_ += *it;
187  ++it;
188  }
189 
190  // Now do the last trimming step and return the result.
191  return trim_right( buffer_, trim_chars_ );
192 }
193 
204 std::vector<std::string> CsvReader::parse_line( utils::InputStream& input_stream ) const
205 {
206  // Init.
207  auto& it = input_stream;
208  std::vector<std::string> result;
209  size_t field_count = 0;
210 
211  // Read until one of the inner breaking conditions applies.
212  // We need this to make sure that the stream can also end with a separator char
213  // (it then depends on the settings whether an empty field is added to the line).
214  while( true ) {
215 
216  // Skip comment lines if needed.
217  while( comment_chars_.find( *it ) != std::string::npos ) {
218  skip_until( it, '\n' );
219  assert( *it == '\n' );
220  ++it;
221  }
222 
223  auto field = parse_field( it );
224  ++field_count;
225 
226  // Store the field if it has content. If not, store it anyway if we do not want to
227  // merge adjacent separators (i.e., leave out empty fields).
228  if( field.size() > 0 || ! merge_separators_ ) {
229  result.push_back( field );
230  }
231 
232  // No more input or end of the line. Leave.
233  if( ! it || *it == '\n' ) {
234  // We can go to the next char even if its the end of the stream. Nothing bad happens.
235  ++it;
236 
237  // Skip empty lines and continue parsing, if needed.
238  // We need the additional field counter to make sure that we do not skip lines that
239  // "seem" empty because all their fields are empty and were merged (in case
240  // merge_separator is true).
241  if( skip_empty_lines_
242  && field_count == 1
243  && std::all_of( field.begin(), field.end(), isblank )
244  ) {
245 
246  // Special case: The file ends on an empty line.
247  // We then return an empty vector as a sign that there was nothing left -
248  // the reader functions will not add a line then.
249  if( ! it ) {
250  return std::vector<std::string>();
251  }
252 
253  // Reset and parse next line.
254  result.clear();
255  field_count = 0;
256  continue;
257  }
258 
259  // If this was not an empty line that we skipped, we are done with this line.
260  break;
261  }
262 
263  // If we are here, parse_field left the stream at the separator char. Assert that this
264  // is the case. We do not want to check this in release, as it is expensive.
265  // Move to the next char, so that we can scan the next field.
266  assert( separator_chars_.find( *it ) != std::string::npos );
267  ++it;
268  }
269 
270  // Special case: Merge separators is set to true and all fields were empty. This results
271  // in no content, but we at least want to return one empty field for that line.
272  if( result.size() == 0 ) {
273  assert( merge_separators_ == true );
274  result.push_back( "" );
275  }
276 
277  return result;
278 }
279 
280 // =================================================================================================
281 // Properties
282 // =================================================================================================
283 
284 // ---------------------------------------------------------------------
285 // comment_chars
286 // ---------------------------------------------------------------------
287 
298 CsvReader& CsvReader::comment_chars( std::string const& chars )
299 {
300  comment_chars_ = chars;
301  return *this;
302 }
303 
310 std::string const& CsvReader::comment_chars() const
311 {
312  return comment_chars_;
313 }
314 
315 // ---------------------------------------------------------------------
316 // trim_chars
317 // ---------------------------------------------------------------------
318 
328 CsvReader& CsvReader::trim_chars( std::string const& chars )
329 {
330  trim_chars_ = chars;
331  return *this;
332 }
333 
340 std::string const& CsvReader::trim_chars() const
341 {
342  return trim_chars_;
343 }
344 
345 // ---------------------------------------------------------------------
346 // quotation_chars
347 // ---------------------------------------------------------------------
348 
364 CsvReader& CsvReader::quotation_chars( std::string const& chars )
365 {
366  quotation_chars_ = chars;
367  return *this;
368 }
369 
376 std::string const& CsvReader::quotation_chars() const
377 {
378  return quotation_chars_;
379 }
380 
381 // ---------------------------------------------------------------------
382 // separator_chars
383 // ---------------------------------------------------------------------
384 
400 CsvReader& CsvReader::separator_chars( std::string const& chars )
401 {
402  separator_chars_ = chars;
403  return *this;
404 }
405 
412 std::string const& CsvReader::separator_chars() const
413 {
414  return separator_chars_;
415 }
416 
417 // ---------------------------------------------------------------------
418 // skip_empty_lines
419 // ---------------------------------------------------------------------
420 
430 {
431  skip_empty_lines_ = value;
432  return *this;
433 }
434 
441 {
442  return skip_empty_lines_;
443 }
444 
445 // ---------------------------------------------------------------------
446 // merge_separators
447 // ---------------------------------------------------------------------
448 
468 {
469  merge_separators_ = value;
470  return *this;
471 }
472 
479 {
480  return merge_separators_;
481 }
482 
483 // ---------------------------------------------------------------------
484 // use_escapes
485 // ---------------------------------------------------------------------
486 
500 {
501  use_escapes_ = value;
502  return *this;
503 }
504 
511 {
512  return use_escapes_;
513 }
514 
515 // ---------------------------------------------------------------------
516 // use_twin_quotes
517 // ---------------------------------------------------------------------
518 
540 {
541  use_twin_quotes_ = value;
542  return *this;
543 }
544 
551 {
552  return use_twin_quotes_;
553 }
554 
555 } // namespace utils
556 } // namespace genesis
std::string parse_quoted_string(utils::InputStream &source, bool use_escapes, bool use_twin_quotes, bool include_qmarks)
Read a string in quotation marks from a stream and return it.
Definition: parser.cpp:116
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:151
table from_string(std::string const &fs) const
Read a string in CSV format and return its contents.
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with right trimmed white spaces.
Definition: string.cpp:214
bool use_twin_quotes() const
Return whether to interpret two consecutive quotation marks as a single ("escaped") one...
std::string deescape(std::string const &text)
Return a string where backslash-escaped characters are transformed into their respective string form...
Definition: string.cpp:335
table parse_document(utils::InputStream &input_stream) const
Parse a whole CSV document and return its contents.
std::string parse_field(utils::InputStream &input_stream) const
Parse one field (i.e., one cell) of the CSV data and return it.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Provides some valuable additions to STD.
table from_stream(std::istream &is) const
Read CSV data until the end of the stream is reached, and return it.
bool merge_separators() const
Return the current setting whether consecutive separators are merged or not.
std::string const & quotation_chars() const
Return the currently set chars for quoting strings in fields.
Provides some commonly used string utility functions.
Provides functions for accessing the file system.
Read Comma Separated Values (CSV) data and other delimiter-separated formats.
bool skip_empty_lines() const
Return whether currently empty lines are skipped.
table from_file(std::string const &fn) const
Read a CSV file and return its contents.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one...
Definition: scanner.hpp:182
std::string const & trim_chars() const
Return the currently set chars that are trimmed from the start and end of each field.
std::string const & comment_chars() const
Return the currently set chars that are used to mark comment lines.
bool use_escapes() const
Return whether backslash escape sequences are used.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
std::string const & separator_chars() const
Return the currently set chars used to separate fields of the CSV data.