A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
taxonomy_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
42 
43 #include <assert.h>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 #include <vector>
48 
49 namespace genesis {
50 namespace taxonomy {
51 
52 // =================================================================================================
53 // Constructor and Rule of Five
54 // =================================================================================================
55 
62 {
    // The only setup done here: make the internal CSV reader split fields at tab characters.
63  csv_reader_.separator_chars( "\t" );
64 }
65 
66 // =================================================================================================
67 // Reading
68 // =================================================================================================
69 
/**
 * Read taxonomy data from an input stream and add its contents to a Taxonomy.
 *
 * @param is  Stream to read from. It is wrapped in a utils::StreamInputSource.
 * @param tax Taxonomy that receives the parsed taxa.
 */
74 void TaxonomyReader::from_stream( std::istream& is, Taxonomy& tax ) const
75 {
    // Wrap the stream so that it can be handed to the parser, which does the actual work.
76  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( is ));
77  parse_document( it, tax );
78 }
79 
/**
 * Read a taxonomy file and add its contents to a Taxonomy.
 *
 * @param fn  Name of the file to read. It is passed on to a utils::FileInputSource.
 * @param tax Taxonomy that receives the parsed taxa.
 */
83 void TaxonomyReader::from_file( std::string const& fn, Taxonomy& tax ) const
84 {
    // Wrap the file in an input stream and let parse_document() do the actual work.
85  utils::InputStream it( utils::make_unique< utils::FileInputSource >( fn ));
86  parse_document( it, tax );
87 }
88 
/**
 * Read a string with taxonomy data and add its contents to a Taxonomy.
 *
 * @param is  String that contains the taxonomy data itself (it is used as a
 *            utils::StringInputSource).
 * @param tax Taxonomy that receives the parsed taxa.
 */
92 void TaxonomyReader::from_string( std::string const& is, Taxonomy& tax ) const
93 {
    // Wrap the string in an input stream and let parse_document() do the actual work.
94  utils::InputStream it( utils::make_unique< utils::StringInputSource >( is ));
95  parse_document( it, tax );
96 }
97 
98 // =================================================================================================
99 // Parsing
100 // =================================================================================================
101 
106  utils::InputStream& it, // Stream that the taxonomy lines are read from.
107  Taxonomy& tax // Taxonomy that the parsed taxa are added to.
108 ) const {
    // Parse all lines of the input and add one taxon per line to `tax`.
    // The loop continues for as long as the stream object converts to true.
109  while( it ) {
110  // Read the next line and extract its name and rank fields.
111  auto line = parse_line( it );
112 
    // Lines that yield an empty name are skipped.
113  if( line.name == "" ) {
114  // The line is dropped without any notice. A warning might be preferable here.
115  continue;
116  }
117 
118  // Turn the name into a Taxopath and add it to the taxonomy. The expect_strict_order_
    // setting is passed on as the third argument of add_from_taxopath().
119  auto& taxon = add_from_taxopath(
120  tax,
121  taxopath_parser_.from_string( line.name ),
122  expect_strict_order_
123  );
124 
125  // Set the rank. This is done for every line, even if the rank is empty, which is for
    // example the case when the rank field position is negative (see parse_line()).
    // NOTE(review): if add_from_taxopath() can return an already existing taxon,
    // this overwrites its rank - confirm that this is intended.
126  taxon.rank( line.rank );
127  }
128 }
129 
137 ) const {
    // Read a single line from the input stream `it` and return its name and rank as a Line.
    // Throws std::out_of_range if an active field position is not available in the line.

138  // Let the CSV reader split the current line into its fields.
139  auto fields = csv_reader_.parse_line( it );
140 
141  // Helper function to find the correct field for a property, or throw if invalid.
    // `field_pos` is the zero-based position of the field, and `field_name` is only used
    // for the error message. The result is returned by value.
142  auto get_field = [&] ( int field_pos, std::string field_name ) {
143  // A negative position means that the field is not used: yield an empty string then.
144  if( field_pos < 0 ) {
145  return std::string();
146  }
147 
148  // Cast so that array lookup does not complain.
    // Negative values have been handled above, so the cast cannot wrap around.
149  assert( field_pos >= 0 );
150  auto pos = static_cast< size_t >( field_pos );
151 
152  // The line has fewer fields than needed for the requested position.
153  if( pos >= fields.size() ) {
    // NOTE(review): the reported line is it.line() - 1, presumably because the stream
    // has already moved on to the next line at this point - confirm.
154  throw std::out_of_range(
155  "Invalid position for taxonomy " + field_name + " field while reading. Expect "
156  + field_name + " at position " + utils::to_string( pos ) + " (zero-based), "
157  + "but the line only contains " + utils::to_string( fields.size() )
158  + " fields at line " + utils::to_string( it.line() - 1 ) + "."
159  );
160  }
161 
162  // Return a copy of the requested field.
163  assert( pos < fields.size() );
164  return fields[ pos ];
165  };
166 
167  // Read fields from line. The name is looked up first, so if both positions are invalid,
    // the exception refers to the name field.
168  Line result;
169  result.name = get_field( name_field_position_, "name" );
170  result.rank = get_field( rank_field_position_, "rank" );
171 
172  return result;
173 }
174 
175 // =================================================================================================
176 // Properties
177 // =================================================================================================
178 
192 {
    // Return the CsvReader member that parse_line() uses to split each line into fields.
193  return csv_reader_;
194 }
195 
204 {
    // Return the TaxopathParser member that parse_document() uses to turn each name
    // into a Taxopath.
205  return taxopath_parser_;
206 }
207 
227 {
    // Set the zero-based position of the field that contains the taxon name.
    // Throws std::out_of_range for negative values. Returns *this, so that calls can be chained.

228  // We could also use size_t instead of int here to avoid setting the value to sub-zero.
229  // However, now we have consistency with the rank field position, which is nicer.
    // The name field cannot be deactivated, hence negative values are rejected.
230  if( value < 0 ) {
231  throw std::out_of_range(
232  "Cannot set TaxonomyReader::name_field_position to a value below zero."
233  );
234  }
235  name_field_position_ = value;
236  return *this;
237 }
238 
245 {
    // Return the currently set position of the name field (zero-based, as used in parse_line()).
246  return name_field_position_;
247 }
248 
264 {
    // Set the position of the field that contains the taxon rank. Unlike for the name field
    // position, no range check is done here: parse_line() treats a negative position as
    // "field not used" and then yields an empty rank. Returns *this, so that calls can be chained.
265  rank_field_position_ = value;
266  return *this;
267 }
268 
275 {
    // Return the currently set position of the rank field. It can be negative, in which case
    // parse_line() does not read a rank.
276  return rank_field_position_;
277 }
278 
301 {
    // Store the flag that parse_document() passes on to add_from_taxopath() as its
    // third argument. Returns *this, so that calls can be chained.
302  expect_strict_order_ = value;
303  return *this;
304 }
305 
312 {
    // Return whether the reader currently expects a strict order of taxa.
313  return expect_strict_order_;
314 }
315 
316 } // namespace taxonomy
317 } // namespace genesis
void parse_document(utils::InputStream &it, Taxonomy &tax) const
Parse all data from an InputStream into a Taxonomy object.
int name_field_position() const
Get the currently set position of the field in each line where the taxon name is located.
TaxonomyReader()
Default constructor.
void from_string(std::string const &is, Taxonomy &tax) const
Read a string with taxonomy data and add its contents to a Taxonomy.
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
utils::CsvReader & csv_reader()
Get the CsvReader used for reading a taxonomy file.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Provides some valuable additions to STD.
Provides some commonly used string utility functions.
Provides functions for accessing the file system.
TaxopathParser & taxopath_parser()
Get the TaxopathParser used for parsing taxonomic path strings.
Taxon & add_from_taxopath(Taxonomy &taxonomy, Taxopath const &taxopath, bool expect_parents)
Add a Taxon to a Taxonomy, using the taxonomic elements of a Taxopath.
Read Comma Separated Values (CSV) data and other delimiter-separated formats.
Helper class to parse a string containing a taxonomic path string into a Taxopath object...
Taxopath from_string(std::string const &taxopath) const
Parse a taxonomic path string into a Taxopath object and return it.
void from_file(std::string const &fn, Taxonomy &tax) const
Read a taxonomy file and add its contents to a Taxonomy.
size_t line() const
Return the current line of the input stream.
Line parse_line(utils::InputStream &it) const
Read a single line of a taxonomy file and return the contained name and rank.
bool expect_strict_order() const
Return whether currently the reader expects a strict order of taxa.
void from_stream(std::istream &is, Taxonomy &tax) const
Read taxonomy data until the end of the stream is reached, and add the contents to a Taxonomy...
std::string const & rank() const
Return the rank of this taxon.
Definition: taxon.cpp:189
Internal helper structure that stores the relevant data of one line while reading.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
Read Taxonomy file formats.