A toolkit for working with phylogenetic data.
v0.24.0
utils/formats/csv/reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_FORMATS_CSV_READER_H_
2 #define GENESIS_UTILS_FORMATS_CSV_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
35 
36 #include <iosfwd>
37 #include <memory>
38 #include <string>
39 #include <vector>
40 
namespace genesis {
namespace utils {

// =================================================================================================
//     Forward Declarations
// =================================================================================================

// Both types are only used by reference or as the pointee of a std::shared_ptr in the
// declarations below, so forward declarations are sufficient to keep this header self-contained.
// NOTE(review): BaseInputSource is assumed to live in genesis::utils, next to InputStream;
// confirm against the header that defines it.
class BaseInputSource;
class InputStream;

// =================================================================================================
//     Csv Reader
// =================================================================================================

/**
 * @brief Read Comma/Character Separated Values (CSV) data and other delimiter-separated formats.
 *
 * The reader is configured via chainable setters (each returns a reference to the reader itself),
 * for example `CsvReader().separator_chars( "\t" ).skip_empty_lines( true )`.
 *
 * Defaults: no comment chars, no trim chars, `"` as quotation char, `,` as separator char,
 * empty lines are not skipped, separators are not merged, backslash escapes are off,
 * and twin quotes are on.
 */
class CsvReader
{
public:

    // ---------------------------------------------------------------------
    //     Typedefs and Enums
    // ---------------------------------------------------------------------

    /** @brief One field (cell) of the data. */
    using Field = std::string;

    /** @brief One line of the data, i.e., a list of fields. */
    using Line  = std::vector<Field>;

    /** @brief A whole document, i.e., a list of lines. */
    using Table = std::vector<Line>;

    // ---------------------------------------------------------------------
    //     Constructor and Rule of Five
    // ---------------------------------------------------------------------

    CsvReader()  = default;
    ~CsvReader() = default;

    CsvReader( CsvReader const& ) = default;
    CsvReader( CsvReader&& )      = default;

    CsvReader& operator= ( CsvReader const& ) = default;
    CsvReader& operator= ( CsvReader&& )      = default;

    // ---------------------------------------------------------------------
    //     Reading
    // ---------------------------------------------------------------------

    /**
     * @brief Read CSV data from a source and return it as a Table, that is, one Line per
     * line of the input, each of which contains the Field%s of that line.
     */
    Table read( std::shared_ptr<BaseInputSource> source ) const;

    // ---------------------------------------------------------------------
    //     Parsing
    // ---------------------------------------------------------------------

    /**
     * @brief Parse a whole CSV document and return its contents.
     */
    Table parse_document(
        utils::InputStream& input_stream
    ) const;

    /**
     * @brief Parse one field (i.e., one cell) of the CSV data and return it.
     */
    std::string parse_field(
        utils::InputStream& input_stream
    ) const;

    /**
     * @brief Parse one line of the CSV data and return it.
     */
    std::vector<std::string> parse_line(
        utils::InputStream& input_stream
    ) const;

    // ---------------------------------------------------------------------
    //     Properties
    // ---------------------------------------------------------------------

    /**
     * @brief Set chars that are used to mark comment lines.
     *
     * Default is the empty string. Returns the reader itself to allow chaining.
     */
    CsvReader& comment_chars( std::string const& chars )
    {
        comment_chars_ = chars;
        return *this;
    }

    /**
     * @brief Return the currently set chars that are used to mark comment lines.
     */
    std::string const& comment_chars() const
    {
        return comment_chars_;
    }

    /**
     * @brief Set chars that are trimmed from the start and end of each field.
     *
     * Default is the empty string. Returns the reader itself to allow chaining.
     */
    CsvReader& trim_chars( std::string const& chars )
    {
        trim_chars_ = chars;
        return *this;
    }

    /**
     * @brief Return the currently set chars that are trimmed from the start and end of each field.
     */
    std::string const& trim_chars() const
    {
        return trim_chars_;
    }

    /**
     * @brief Set the chars that are used for quoting strings in fields.
     *
     * Default is the double quotation mark `"`. Returns the reader itself to allow chaining.
     */
    CsvReader& quotation_chars( std::string const& chars )
    {
        quotation_chars_ = chars;
        return *this;
    }

    /**
     * @brief Return the currently set chars for quoting strings in fields.
     */
    std::string const& quotation_chars() const
    {
        return quotation_chars_;
    }

    /**
     * @brief Set the chars used to separate fields of the CSV data.
     *
     * Default is the comma `,`. Returns the reader itself to allow chaining.
     */
    CsvReader& separator_chars( std::string const& chars )
    {
        separator_chars_ = chars;
        return *this;
    }

    /**
     * @brief Return the currently set chars used to separate fields of the CSV data.
     */
    std::string const& separator_chars() const
    {
        return separator_chars_;
    }

    /**
     * @brief Set whether to skip empty lines.
     *
     * Default is `false`. Returns the reader itself to allow chaining.
     */
    CsvReader& skip_empty_lines( bool value )
    {
        skip_empty_lines_ = value;
        return *this;
    }

    /**
     * @brief Return whether currently empty lines are skipped.
     */
    bool skip_empty_lines() const
    {
        return skip_empty_lines_;
    }

    /**
     * @brief Set whether consecutive separator chars are merged or whether each of them
     * creates a new field.
     *
     * Default is `false`. Returns the reader itself to allow chaining.
     */
    CsvReader& merge_separators( bool value )
    {
        merge_separators_ = value;
        return *this;
    }

    /**
     * @brief Return the current setting whether consecutive separators are merged or not.
     */
    bool merge_separators() const
    {
        return merge_separators_;
    }

    /**
     * @brief Set whether to use backslash escape sequences.
     *
     * Default is `false`. Returns the reader itself to allow chaining.
     */
    CsvReader& use_escapes( bool value )
    {
        use_escapes_ = value;
        return *this;
    }

    /**
     * @brief Return whether backslash escape sequences are used.
     */
    bool use_escapes() const
    {
        return use_escapes_;
    }

    /**
     * @brief Set whether to interpret two consecutive quotation marks as a single
     * ("escaped") one.
     *
     * Default is `true`. Returns the reader itself to allow chaining.
     */
    CsvReader& use_twin_quotes( bool value )
    {
        use_twin_quotes_ = value;
        return *this;
    }

    /**
     * @brief Return whether to interpret two consecutive quotation marks as a single
     * ("escaped") one.
     */
    bool use_twin_quotes() const
    {
        return use_twin_quotes_;
    }

    // ---------------------------------------------------------------------
    //     Members
    // ---------------------------------------------------------------------

private:

    // We store the following char sets as strings and use find() to check whether a given char
    // is part of the sets. This is linear in length of the string. As there are usually just a
    // few chars in there, this is fast. We also tested with a char lookup table, which offers
    // constant time, but still was slower. See also http://stackoverflow.com/a/29068727/4184258
    std::string comment_chars_   = "";
    std::string trim_chars_      = "";
    std::string quotation_chars_ = "\"";
    std::string separator_chars_ = ",";

    bool skip_empty_lines_ = false;
    bool merge_separators_ = false;
    bool use_escapes_      = false;
    bool use_twin_quotes_  = true;

    // We use a buffer in order to make copying and resizing strings rare and hence fast.
    // This buffer will grow for bigger csv input fields (but never shrink). We then copy from it,
    // so that the new strings are as small as possible. After some fields, the buffer size
    // approaches a value where it rarely needs to grow any more. Speedup (using a 4MB file): ~20%.
    // It is mutable so that the const parsing functions can reuse it.
    mutable std::string buffer_;

};

} // namespace utils
} // namespace genesis
428 
429 #endif // include guard
bool skip_empty_lines() const
Return whether currently empty lines are skipped.
CsvReader & merge_separators(bool value)
Set whether consecutive separator chars are merged or whether each of them creates a new field...
std::string const & separator_chars() const
Return the currently set chars used to separate fields of the CSV data.
CsvReader & use_twin_quotes(bool value)
Set whether to interpret two consecutive quotation marks as a single ("escaped") one.
std::string parse_field(utils::InputStream &input_stream) const
Parse one field (i.e., one cell) of the CSV data and return it.
bool use_escapes() const
Return whether backslash escape sequences are used.
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
bool merge_separators() const
Return the current setting whether consecutive separators are merged or not.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
CsvReader & use_escapes(bool value)
Set whether to use backslash escape sequences.
std::string const & trim_chars() const
Return the currently set chars that are trimmed from the start and end of each field.
bool use_twin_quotes() const
Return whether to interpret two consecutive quotation marks as a single ("escaped") one...
Read Comma/Character Separated Values (CSV) data and other delimiter-separated formats.
CsvReader & trim_chars(std::string const &chars)
Set chars that are trimmed from the start and end of each field.
CsvReader & operator=(CsvReader const &)=default
Table parse_document(utils::InputStream &input_stream) const
Parse a whole CSV document and return its contents.
CsvReader & quotation_chars(std::string const &chars)
Set the chars that are used for quoting strings in fields.
std::string const & comment_chars() const
Return the currently set chars that are used to mark comment lines.
Table read(std::shared_ptr< BaseInputSource > source) const
Read CSV data from a source and return it as a table, using a vector per line, containing a vector of...
CsvReader & skip_empty_lines(bool value)
Set whether to skip empty lines.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
CsvReader & comment_chars(std::string const &chars)
Set chars that are used to mark comment lines.
std::string const & quotation_chars() const
Return the currently set chars for quoting strings in fields.