A toolkit for working with phylogenetic data.
v0.24.0
utils/formats/csv/reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include <algorithm>
34 #include <cassert>
35 #include <cctype>
36 #include <fstream>
37 #include <sstream>
38 #include <stdexcept>
39 
46 
47 namespace genesis {
48 namespace utils {
49 
50 // =================================================================================================
51 // Reading
52 // =================================================================================================
53 
54 CsvReader::Table CsvReader::read( std::shared_ptr<BaseInputSource> source ) const
55 {
56  utils::InputStream it( source );
57  return parse_document( it );
58 }
59 
60 // =================================================================================================
61 // Parse Document
62 // =================================================================================================
63 
65  utils::InputStream& input_stream
66 ) const {
67  auto& it = input_stream;
68  Table result;
69 
70  while( it ) {
71  // Parse the next line and push it if it has content.
72  // (If the file ends on empty lines, this might not be the case).
73  auto line = parse_line( it );
74  if( line.size() > 0 ) {
75  result.push_back( line );
76  }
77  }
78 
79  return result;
80 }
81 
82 // =================================================================================================
83 // Parse Field
84 // =================================================================================================
85 
86 std::string CsvReader::parse_field( utils::InputStream& input_stream ) const
87 {
88  // Init. We use a buffer in order to keep memory reallocations to a minimum.
89  // See see member variable in the class declaration for more information.
90  auto& it = input_stream;
91  buffer_ = "";
92 
93  // Trim the start of the field by skipping chars if needed.
94  // (The end needs to be actually trimmed, as we don't know when it comes.)
95  skip_while( it, [&] ( char c ) {
96  return trim_chars_.find( c ) != std::string::npos;
97  });
98 
99  // Read as long as there is input. We will break when finding a new line later.
100  while( it ) {
101 
102  // Treat escape sequences if needed.
103  if( use_escapes_ && *it == '\\' ) {
104 
105  // Skip the backslash.
106  ++it;
107 
108  // We found an escaping backslash. This cannot be the end of the stream.
109  if( !it ) {
110  throw std::runtime_error(
111  "Unexpected end of string at " + it.at() + ". Expecting escape sequence."
112  );
113  }
114 
115  // De-escape.
116  buffer_ += deescape( *it );
117 
118  // Next char. We skip the rest of this loop, as we already treated the current char.
119  ++it;
120  continue;
121  }
122 
123  // Finish reading at the end of the line or when one of the separator chars is found.
124  if( *it == '\n' || separator_chars_.find( *it ) != std::string::npos ) {
125  break;
126  }
127 
128  // Parse quoted strings if needed.
129  // We add them to the result, even when they occur in the middle of a field.
130  if( quotation_chars_.find( *it ) != std::string::npos ) {
131 
132  // If the parsing results in an empty string, this means that there were two
133  // consecutive quotation marks. So, if also use_twin_quotes is activated, we need
134  // to add one quotation mark to the result.
135  // In all other cases (i.e., there was content in the quoted string, or we do not
136  // use twin quotes), add this content to the result.
137  char qm = *it;
138  auto qs = parse_quoted_string( it, use_escapes_, use_twin_quotes_, false );
139  if( qs == "" && use_twin_quotes_ ) {
140  buffer_ += qm;
141  } else {
142  buffer_ += qs;
143  }
144  continue;
145  }
146 
147  // In any other case, simply read the char.
148  buffer_ += *it;
149  ++it;
150  }
151 
152  // Now do the last trimming step and return the result.
153  return trim_right( buffer_, trim_chars_ );
154 }
155 
156 // =================================================================================================
157 // Parse Line
158 // =================================================================================================
159 
160 std::vector<std::string> CsvReader::parse_line( utils::InputStream& input_stream ) const
161 {
162  // Init.
163  auto& it = input_stream;
164  std::vector<std::string> result;
165  size_t field_count = 0;
166 
167  // Read until one of the inner breaking conditions applies.
168  // We need this to make sure that the stream can also end with a separator char
169  // (it then depends on the settings whether an empty field is added to the line).
170  while( true ) {
171 
172  // Skip comment lines if needed.
173  while( comment_chars_.find( *it ) != std::string::npos ) {
174  skip_until( it, '\n' );
175  assert( *it == '\n' );
176  ++it;
177  }
178 
179  auto field = parse_field( it );
180  ++field_count;
181 
182  // Store the field if it has content. If not, store it anyway if we do not want to
183  // merge adjacent separators (i.e., leave out empty fields).
184  if( field.size() > 0 || ! merge_separators_ ) {
185  result.push_back( field );
186  }
187 
188  // No more input or end of the line. Leave.
189  if( ! it || *it == '\n' ) {
190  // We can go to the next char even if its the end of the stream. Nothing bad happens.
191  ++it;
192 
193  // Skip empty lines and continue parsing, if needed.
194  // We need the additional field counter to make sure that we do not skip lines that
195  // "seem" empty because all their fields are empty and were merged (in case
196  // merge_separator is true).
197  if( skip_empty_lines_
198  && field_count == 1
199  && std::all_of( field.begin(), field.end(), isblank )
200  ) {
201 
202  // Special case: The file ends on an empty line.
203  // We then return an empty vector as a sign that there was nothing left -
204  // the reader functions will not add a line then.
205  if( ! it ) {
206  return std::vector<std::string>();
207  }
208 
209  // Reset and parse next line.
210  result.clear();
211  field_count = 0;
212  continue;
213  }
214 
215  // If this was not an empty line that we skipped, we are done with this line.
216  break;
217  }
218 
219  // If we are here, parse_field left the stream at the separator char. Assert that this
220  // is the case. We do not want to check this in release, as it is expensive.
221  // Move to the next char, so that we can scan the next field.
222  assert( separator_chars_.find( *it ) != std::string::npos );
223  ++it;
224  }
225 
226  // Special case: Merge separators is set to true and all fields were empty. This results
227  // in no content, but we at least want to return one empty field for that line.
228  if( result.size() == 0 ) {
229  assert( merge_separators_ == true );
230  result.push_back( "" );
231  }
232 
233  return result;
234 }
235 
236 } // namespace utils
237 } // namespace genesis
std::string parse_quoted_string(utils::InputStream &source, bool use_escapes, bool use_twin_quotes, bool include_qmarks)
Read a string in quotation marks from a stream and return it.
Definition: parser.cpp:116
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:153
std::string deescape(std::string const &text)
Return a string where backslash-escaped characters are transformed into their respective string form...
Definition: string.cpp:507
std::string parse_field(utils::InputStream &input_stream) const
Parse one field (i.e., one cell) of the CSV data and return it.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
Provides some valuable additions to STD.
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with right trimmed white spaces.
Definition: string.cpp:370
Provides some commonly used string utility functions.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Provides functions for accessing the file system.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one...
Definition: scanner.hpp:184
Table parse_document(utils::InputStream &input_stream) const
Parse a whole CSV document and return its contents.
Table read(std::shared_ptr< BaseInputSource > source) const
Read CSV data from a source and return it as a table, using a vector per line, containing a vector of...
Stream interface for reading data from an InputSource, that keeps track of line and column counters...