A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
taxonomy_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
42 
43 #include <assert.h>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 #include <vector>
48 
49 namespace genesis {
50 namespace taxonomy {
51 
52 // =================================================================================================
53 // Constructor and Rule of Five
54 // =================================================================================================
55 
57 {
58  csv_reader_.separator_chars( "\t" );
59 }
60 
61 // =================================================================================================
62 // Reading
63 // =================================================================================================
64 
65 void TaxonomyReader::from_stream( std::istream& is, Taxonomy& tax ) const
66 {
67  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( is ));
68  parse_document( it, tax );
69 }
70 
71 void TaxonomyReader::from_file( std::string const& fn, Taxonomy& tax ) const
72 {
73  utils::InputStream it( utils::make_unique< utils::FileInputSource >( fn ));
74  parse_document( it, tax );
75 }
76 
77 void TaxonomyReader::from_string( std::string const& is, Taxonomy& tax ) const
78 {
79  utils::InputStream it( utils::make_unique< utils::StringInputSource >( is ));
80  parse_document( it, tax );
81 }
82 
83 Taxonomy TaxonomyReader::from_stream( std::istream& is ) const
84 {
85  Taxonomy res;
86  from_stream( is, res );
87  return res;
88 }
89 
90 Taxonomy TaxonomyReader::from_file( std::string const& fn ) const
91 {
92  Taxonomy res;
93  from_file( fn, res );
94  return res;
95 }
96 
97 Taxonomy TaxonomyReader::from_string( std::string const& is ) const
98 {
99  Taxonomy res;
100  from_string( is, res );
101  return res;
102 }
103 
104 // =================================================================================================
105 // Parsing
106 // =================================================================================================
107 
109  utils::InputStream& it,
110  Taxonomy& tax
111 ) const {
112  while( it ) {
113  // Get line as name rank pair.
114  auto line = parse_line( it );
115 
116  if( line.name == "" ) {
117  // Maybe here should be a warning instead of silent skipping...
118  continue;
119  }
120 
121  // Parse the taxopath and add it to the taxonomy.
122  auto& taxon = add_from_taxopath(
123  tax,
124  taxopath_parser_.from_string( line.name ),
125  expect_strict_order_
126  );
127 
128  // Set the rank and ID.
129  taxon.rank( line.rank );
130  taxon.id( line.id );
131  }
132 }
133 
136 ) const {
137  // Get the fields of the current line.
138  auto fields = csv_reader_.parse_line( it );
139 
140  // Helper function to find the correct field for a property, or throw if invalid.
141  auto get_field = [&] ( int field_pos, std::string field_name ) {
142  // Check if field is actually "active".
143  if( field_pos < 0 ) {
144  return std::string();
145  }
146 
147  // Cast so that array lookup does not complain.
148  assert( field_pos >= 0 );
149  auto pos = static_cast< size_t >( field_pos );
150 
151  // Check for invalid position.
152  if( pos >= fields.size() ) {
153  throw std::out_of_range(
154  "Invalid position for taxonomy " + field_name + " field while reading. Expect "
155  + field_name + " at position " + utils::to_string( pos ) + " (zero-based), "
156  + "but the line only contains " + utils::to_string( fields.size() )
157  + " fields at line " + utils::to_string( it.line() - 1 ) + "."
158  );
159  }
160 
161  // Return result.
162  assert( pos < fields.size() );
163  return fields[ pos ];
164  };
165 
166  // Read fields from line.
167  Line result;
168  result.name = get_field( name_field_position_, "name" );
169  result.rank = get_field( rank_field_position_, "rank" );
170  result.id = get_field( id_field_position_, "ID" );
171 
172  return result;
173 }
174 
175 // =================================================================================================
176 // Properties
177 // =================================================================================================
178 
180 {
181  return csv_reader_;
182 }
183 
185 {
186  return taxopath_parser_;
187 }
188 
190 {
191  // We could also use size_t instead of int here to avoid setting the value to sub-zero.
192  // However, now we have consistency with the rank field position, which is nicer.
193  if( value < 0 ) {
194  throw std::out_of_range(
195  "Cannot set TaxonomyReader::name_field_position to a value below zero."
196  );
197  }
198  name_field_position_ = value;
199  return *this;
200 }
201 
203 {
204  return name_field_position_;
205 }
206 
208 {
209  rank_field_position_ = value;
210  return *this;
211 }
212 
214 {
215  return rank_field_position_;
216 }
217 
219 {
220  id_field_position_ = value;
221  return *this;
222 }
223 
225 {
226  return id_field_position_;
227 }
228 
230 {
231  expect_strict_order_ = value;
232  return *this;
233 }
234 
236 {
237  return expect_strict_order_;
238 }
239 
240 } // namespace taxonomy
241 } // namespace genesis
void parse_document(utils::InputStream &it, Taxonomy &tax) const
Parse all data from an InputStream into a Taxonomy object.
int name_field_position() const
Get the currently set position of the field in each line where the taxon name is located.
TaxonomyReader()
Default constructor.
void from_string(std::string const &is, Taxonomy &tax) const
Read a string with taxonomy data and add its contents to a Taxonomy.
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
utils::CsvReader & csv_reader()
Get the CsvReader used for reading a taxonomy file.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:381
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
int rank_field_position() const
Get the currently set position of the field in each line where the rank name is located.
Provides some valuable additions to STD.
Provides some commonly used string utility functions.
Provides functions for accessing the file system.
TaxopathParser & taxopath_parser()
Get the TaxopathParser used for parsing taxonomic path strings.
Taxon & add_from_taxopath(Taxonomy &taxonomy, Taxopath const &taxopath, bool expect_parents)
Add a Taxon to a Taxonomy, using the taxonomic elements of a Taxopath.
Read Comma Separated Values (CSV) data and other delimiter-separated formats.
Helper class to parse a string containing a taxonomic path string into a Taxopath object...
Taxopath from_string(std::string const &taxopath) const
Parse a taxonomic path string into a Taxopath object and return it.
void from_file(std::string const &fn, Taxonomy &tax) const
Read a taxonomy file and add its contents to a Taxonomy.
size_t line() const
Return the current line of the input stream.
Line parse_line(utils::InputStream &it) const
Read a single line of a taxonomy file and return the contained name, rank, and ID.
bool expect_strict_order() const
Return whether currently the reader expects a strict order of taxa.
void from_stream(std::istream &is, Taxonomy &tax) const
Read taxonomy data until the end of the stream is reached, and add the contents to a Taxonomy...
std::string const & rank() const
Return the rank of this taxon.
Definition: taxon.cpp:145
Internal helper structure that stores the relevant data of one line while reading.
int id_field_position() const
Get the currently set position of the field in each line where the ID is located.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
Read Taxonomy file formats.