A toolkit for working with phylogenetic data.
v0.24.0
taxonomy_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
42 
43 #include <cassert>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 #include <string>
48 #include <vector>
49 
50 namespace genesis {
51 namespace taxonomy {
52 
53 // =================================================================================================
54 // Constructor and Rule of Five
55 // =================================================================================================
56 
// NOTE(review): the definition header (listing line 57) is missing from this listing; this
// body presumably belongs to the TaxonomyReader default constructor - confirm against the
// original file.
// Configures the internal CSV reader so that fields are separated by tab characters.
58 {
59  csv_reader_.separator_chars( "\t" );
60 }
61 
62 // =================================================================================================
63 // Reading
64 // =================================================================================================
65 
66 void TaxonomyReader::read( std::shared_ptr<utils::BaseInputSource> source, Taxonomy& target ) const
67 {
68  utils::InputStream it( source );
69  parse_document( it, target );
70 }
71 
72 Taxonomy TaxonomyReader::read( std::shared_ptr<utils::BaseInputSource> source ) const
73 {
74  Taxonomy result;
75  read( source, result );
76  return result;
77 }
78 
79 // =================================================================================================
80 // Parsing
81 // =================================================================================================
82 
// NOTE(review): the beginning of this definition's signature (listing lines 83-84) is missing
// from this listing; only the trailing `Taxonomy& tax` parameter is visible. The body uses a
// stream named `it`, so presumably this is parse_document( utils::InputStream& it, Taxonomy& tax )
// - confirm against the original file.
//
// Repeatedly parses one line from `it` (as long as `it` evaluates to true) and adds the taxon
// described by that line to `tax`. Lines whose name field is empty are skipped silently.
85  Taxonomy& tax
86 ) const {
87  while( it ) {
88  // Get line as name rank pair.
89  auto line = parse_line( it );
90 
91  if( line.name == "" ) {
92  // Maybe here should be a warning instead of silent skipping...
93  continue;
94  }
95 
96  // Parse the taxopath and add it to the taxonomy.
    // The result is bound by reference, so the setters below act on the returned taxon itself
    // (presumably the one stored inside `tax` - TODO confirm).
97  auto& taxon = add_from_taxopath(
98  tax,
99  taxopath_parser_.parse( line.name ),
100  expect_strict_order_
101  );
102 
103  // Set the rank and ID.
    // These are empty strings if the respective field is deactivated (see parse_line).
104  taxon.rank( line.rank );
105  taxon.id( line.id );
106  }
107 }
108 
// NOTE(review): the beginning of this definition's signature (listing lines 109-110) is missing
// from this listing. The body reads from a stream named `it` and returns a `Line`, so presumably
// this is parse_line( utils::InputStream& it ) - confirm against the original file.
//
// Reads one line of delimiter-separated fields from `it` via the internal CSV reader and
// extracts the name, rank and ID fields according to the configured field positions.
// Throws std::out_of_range if a configured (non-negative) position is not present in the line.
111 ) const {
112  // Get the fields of the current line.
113  auto fields = csv_reader_.parse_line( it );
114 
115  // Helper function to find the correct field for a property, or throw if invalid.
    // `field_pos` is the configured position of the field; `field_name` is only used to build
    // the error message.
116  auto get_field = [&] ( int field_pos, std::string field_name ) {
117  // Check if field is actually "active".
    // A negative position yields an empty string instead of an error.
118  if( field_pos < 0 ) {
119  return std::string();
120  }
121 
122  // Cast so that array lookup does not complain.
123  assert( field_pos >= 0 );
124  auto pos = static_cast< size_t >( field_pos );
125 
126  // Check for invalid position.
127  if( pos >= fields.size() ) {
    // NOTE(review): `it.line() - 1` presumably compensates for the stream having already
    // advanced past the line that was just parsed - confirm.
128  throw std::out_of_range(
129  "Invalid position for taxonomy " + field_name + " field while reading. Expect "
130  + field_name + " at position " + std::to_string( pos ) + " (zero-based), "
131  + "but the line only contains " + std::to_string( fields.size() )
132  + " fields at line " + std::to_string( it.line() - 1 ) + "."
133  );
134  }
135 
136  // Return result.
    // Returned by value: the lambda's deduced return type is std::string, so this is a copy.
137  assert( pos < fields.size() );
138  return fields[ pos ];
139  };
140 
141  // Read fields from line.
142  Line result;
143  result.name = get_field( name_field_position_, "name" );
144  result.rank = get_field( rank_field_position_, "rank" );
145  result.id = get_field( id_field_position_, "ID" );
146 
147  return result;
148 }
149 
150 // =================================================================================================
151 // Properties
152 // =================================================================================================
153 
// NOTE(review): the definition header (listing line 154) is missing from this listing;
// presumably this is the csv_reader() accessor - confirm against the original file.
// Returns the internal CSV reader member used for splitting lines into fields.
155 {
156  return csv_reader_;
157 }
158 
// NOTE(review): the definition header (listing line 159) is missing from this listing;
// presumably this is the taxopath_parser() accessor - confirm against the original file.
// Returns the internal parser member used for turning name strings into taxopaths.
160 {
161  return taxopath_parser_;
162 }
163 
// NOTE(review): the definition header (listing line 164) is missing from this listing;
// presumably this is the name_field_position( value ) setter - confirm against the original file.
// Stores `value` as the position of the name field and returns *this.
// Throws std::out_of_range if `value` is negative; unlike the rank and ID setters, this one
// rejects negative positions.
165 {
166  // We could also use size_t instead of int here to avoid setting the value to sub-zero.
167  // However, now we have consistency with the rank field position, which is nicer.
168  if( value < 0 ) {
169  throw std::out_of_range(
170  "Cannot set TaxonomyReader::name_field_position to a value below zero."
171  );
172  }
173  name_field_position_ = value;
174  return *this;
175 }
176 
// NOTE(review): the definition header (listing line 177) is missing from this listing;
// presumably this is the name_field_position() getter - confirm against the original file.
// Returns the currently stored position of the name field.
178 {
179  return name_field_position_;
180 }
181 
// NOTE(review): the definition header (listing line 182) is missing from this listing;
// presumably this is the rank_field_position( value ) setter - confirm against the original file.
// Stores `value` as the position of the rank field and returns *this. No range check is done
// here; parse_line treats a negative position as "field not used".
183 {
184  rank_field_position_ = value;
185  return *this;
186 }
187 
// NOTE(review): the definition header (listing line 188) is missing from this listing;
// presumably this is the rank_field_position() getter - confirm against the original file.
// Returns the currently stored position of the rank field.
189 {
190  return rank_field_position_;
191 }
192 
// NOTE(review): the definition header (listing line 193) is missing from this listing;
// presumably this is the id_field_position( value ) setter - confirm against the original file.
// Stores `value` as the position of the ID field and returns *this. No range check is done
// here; parse_line treats a negative position as "field not used".
194 {
195  id_field_position_ = value;
196  return *this;
197 }
198 
// NOTE(review): the definition header (listing line 199) is missing from this listing;
// presumably this is the id_field_position() getter - confirm against the original file.
// Returns the currently stored position of the ID field.
200 {
201  return id_field_position_;
202 }
203 
// NOTE(review): the definition header (listing line 204) is missing from this listing;
// presumably this is the expect_strict_order( value ) setter - confirm against the original file.
// Stores `value` in expect_strict_order_ and returns *this. The flag is later passed to
// add_from_taxopath() when taxa are added during parsing.
205 {
206  expect_strict_order_ = value;
207  return *this;
208 }
209 
// NOTE(review): the definition header (listing line 210) is missing from this listing;
// presumably this is the expect_strict_order() getter - confirm against the original file.
// Returns the currently stored strict-order flag.
211 {
212  return expect_strict_order_;
213 }
214 
215 } // namespace taxonomy
216 } // namespace genesis
void read(std::shared_ptr< utils::BaseInputSource > source, Taxonomy &target) const
Read taxonomy data from an input source, and add the contents to a Taxonomy.
int rank_field_position() const
Get the currently set position of the field in each line where the rank name is located.
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
TaxonomyReader()
Default constructor.
utils::CsvReader & csv_reader()
Get the CsvReader used for reading a taxonomy file.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string const & rank() const
Return the rank of this taxon.
Definition: taxon.cpp:145
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
Provides some valuable additions to STD.
void parse_document(utils::InputStream &it, Taxonomy &tax) const
Parse all data from an InputStream into a Taxonomy object.
Provides some commonly used string utility functions.
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Provides functions for accessing the file system.
int name_field_position() const
Get the currently set position of the field in each line where the taxon name is located.
TaxopathParser & taxopath_parser()
Get the TaxopathParser used for parsing taxonomic path strings.
bool expect_strict_order() const
Return whether currently the reader expects a strict order of taxa.
Taxon & add_from_taxopath(Taxonomy &taxonomy, Taxopath const &taxopath, bool expect_parents)
Add a Taxon to a Taxonomy, using the taxonomic elements of a Taxopath.
Definition: taxopath.cpp:76
Read Comma/Character Separated Values (CSV) data and other delimiter-separated formats.
size_t line() const
Return the current line of the input stream.
Helper class to parse a string containing a taxonomic path string into a Taxopath object...
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
int id_field_position() const
Get the currently set position of the field in each line where the ID is located. ...
Internal helper structure that stores the relevant data of one line while reading.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
Taxopath parse(std::string const &taxopath) const
Parse a taxonomic path string into a Taxopath object and return it.
Line parse_line(utils::InputStream &it) const
Read a single line of a taxonomy file and return the contained name and rank.
Read Taxonomy file formats.