A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utils/formats/csv/reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_FORMATS_CSV_READER_H_
2 #define GENESIS_UTILS_FORMATS_CSV_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2017 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
34 #include <iosfwd>
35 #include <string>
36 #include <vector>
37 
namespace genesis {
namespace utils {

// =================================================================================================
//     Forward Declarations
// =================================================================================================

/**
 * @brief Stream interface for reading data from an input source, which keeps track of line and
 * column counters.
 *
 * Only forward-declared here, as this header merely uses references to it.
 */
class InputStream;

// =================================================================================================
//     Csv Reader
// =================================================================================================

/**
 * @brief Read Comma Separated Values (CSV) data and other delimiter-separated formats.
 *
 * The `from_...()` functions read a complete input and return it as a CsvReader::table.
 * The `parse_...()` functions work on an InputStream and read a whole document, a single line,
 * or a single field, respectively.
 *
 * The behaviour of the reader is controlled by the properties below. Their defaults are:
 * no comment chars, no trim chars, `"` as quotation char, `,` as separator char,
 * empty lines are not skipped, separators are not merged, backslash escapes are not used,
 * and twin quotes are used.
 *
 * Each property setter returns a CsvReader reference (presumably `*this`, so that calls can be
 * chained; confirm in the implementation file).
 *
 * NOTE(review): the internal buffer is `mutable` and thus written to by `const` member
 * functions, so sharing one reader instance between threads is presumably not safe; confirm
 * against the implementation.
 */
class CsvReader
{
public:

    // ---------------------------------------------------------------------
    //     Typedefs and Enums
    // ---------------------------------------------------------------------

    /// One field (i.e., one cell) of the CSV data.
    typedef std::string        field;

    /// One line of the CSV data, as a list of fields.
    typedef std::vector<field> row;

    /// A whole CSV document, as a list of rows.
    typedef std::vector<row>   table;

    // ---------------------------------------------------------------------
    //     Constructor and Rule of Five
    // ---------------------------------------------------------------------

    CsvReader()  = default;
    ~CsvReader() = default;

    CsvReader( CsvReader const& ) = default;
    CsvReader( CsvReader&& )      = default;

    CsvReader& operator= ( CsvReader const& ) = default;
    CsvReader& operator= ( CsvReader&& )      = default;

    // ---------------------------------------------------------------------
    //     Reading
    // ---------------------------------------------------------------------

    /// Read CSV data until the end of the stream is reached, and return it.
    table from_stream( std::istream&      is ) const;

    /// Read a CSV file and return its contents.
    table from_file  ( std::string const& fn ) const;

    /// Read a string in CSV format and return its contents.
    table from_string( std::string const& fs ) const;

    // ---------------------------------------------------------------------
    //     Parsing
    // ---------------------------------------------------------------------

    /// Parse a whole CSV document and return its contents.
    table parse_document(
        utils::InputStream& input_stream
    ) const;

    /// Parse one field (i.e., one cell) of the CSV data and return it.
    std::string parse_field(
        utils::InputStream& input_stream
    ) const;

    /// Parse one line of the CSV data and return it.
    std::vector<std::string> parse_line(
        utils::InputStream& input_stream
    ) const;

    // ---------------------------------------------------------------------
    //     Properties
    // ---------------------------------------------------------------------

    /// Set the chars that are used to mark comment lines.
    CsvReader&         comment_chars( std::string const& chars );

    /// Return the currently set chars that are used to mark comment lines.
    std::string const& comment_chars() const;

    /// Set the chars that are trimmed from the start and end of each field.
    CsvReader&         trim_chars( std::string const& chars );

    /// Return the currently set chars that are trimmed from the start and end of each field.
    std::string const& trim_chars() const;

    /// Set the chars for quoting strings in fields.
    CsvReader&         quotation_chars( std::string const& chars );

    /// Return the currently set chars for quoting strings in fields.
    std::string const& quotation_chars() const;

    /// Set the chars used to separate fields of the CSV data.
    CsvReader&         separator_chars( std::string const& chars );

    /// Return the currently set chars used to separate fields of the CSV data.
    std::string const& separator_chars() const;

    /// Set whether empty lines are skipped.
    CsvReader& skip_empty_lines( bool value );

    /// Return whether empty lines are currently skipped.
    bool       skip_empty_lines() const;

    /// Set whether consecutive separators are merged.
    CsvReader& merge_separators( bool value );

    /// Return the current setting whether consecutive separators are merged or not.
    bool       merge_separators() const;

    /// Set whether backslash escape sequences are used.
    CsvReader& use_escapes( bool value );

    /// Return whether backslash escape sequences are used.
    bool       use_escapes() const;

    /// Set whether two consecutive quotation marks are interpreted as a single ("escaped") one.
    CsvReader& use_twin_quotes( bool value );

    /// Return whether two consecutive quotation marks are interpreted as a single ("escaped") one.
    bool       use_twin_quotes() const;

    // ---------------------------------------------------------------------
    //     Members
    // ---------------------------------------------------------------------

private:

    // We store the following char sets as strings and use find() to check whether a given char
    // is part of the sets. This is linear in length of the string. As there are usually just a
    // few chars in there, this is fast. We also tested with a char lookup table, which offers
    // constant time, but still was slower. See also http://stackoverflow.com/a/29068727/4184258
    std::string comment_chars_   = "";
    std::string trim_chars_      = "";
    std::string quotation_chars_ = "\"";
    std::string separator_chars_ = ",";

    bool        skip_empty_lines_ = false;
    bool        merge_separators_ = false;
    bool        use_escapes_      = false;
    bool        use_twin_quotes_  = true;

    // We use a buffer in order to make copying and resizing strings rare and hence fast.
    // This buffer will grow for bigger csv input fields (but never shrink). We then copy from it,
    // so that the new strings are as small as possible. After some fields, the buffer size
    // approaches a value where it rarely needs to grow any more. Speedup (using a 4MB file): ~20%.
    // It is mutable so that the const parsing functions can reuse it.
    mutable std::string buffer_;

};

} // namespace utils
} // namespace genesis
180 
181 #endif // include guard
table from_string(std::string const &fs) const
Read a string in CSV format and return its contents.
bool use_twin_quotes() const
Return whether to interpret two consecutive quotation marks as a single ("escaped") one...
table parse_document(utils::InputStream &input_stream) const
Parse a whole CSV document and return its contents.
std::string parse_field(utils::InputStream &input_stream) const
Parse one field (i.e., one cell) of the CSV data and return it.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
table from_stream(std::istream &is) const
Read CSV data until the end of the stream is reached, and return it.
bool merge_separators() const
Return the current setting whether consecutive separators are merged or not.
std::string const & quotation_chars() const
Return the currently set chars for quoting strings in fields.
Read Comma Separated Values (CSV) data and other delimiter-separated formats.
bool skip_empty_lines() const
Return whether empty lines are currently skipped.
table from_file(std::string const &fn) const
Read a CSV file and return its contents.
CsvReader & operator=(CsvReader const &)=default
std::string const & trim_chars() const
Return the currently set chars that are trimmed from the start and end of each field.
std::string const & comment_chars() const
Return the currently set chars that are used to mark comment lines.
bool use_escapes() const
Return whether backslash escape sequences are used.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
std::string const & separator_chars() const
Return the currently set chars used to separate fields of the CSV data.