A library for working with phylogenetic and population genetic data.
v0.27.0
taxonomy_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
42 
43 #include <cassert>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 #include <string>
48 #include <vector>
49 
50 namespace genesis {
51 namespace taxonomy {
52 
53 // =================================================================================================
54 // Constructor and Rule of Five
55 // =================================================================================================
56 
// Body of the default constructor TaxonomyReader(); its signature line (57) is not part of this listing.
58 {
    // Configure the internal CSV reader to use the tab character as field separator.
59  csv_reader_.separator_chars( "\t" );
60 }
61 
62 // =================================================================================================
63 // Reading
64 // =================================================================================================
65 
// Read taxonomy data from the given input source and add the contents to `target`.
66 void TaxonomyReader::read( std::shared_ptr<utils::BaseInputSource> source, Taxonomy& target ) const
67 {
    // Wrap the source in a stream that keeps track of line and column counters,
    // then hand it over to the document parser.
68  utils::InputStream input_stream( source );
69  parse_document( input_stream, target );
70 }
71 
// Convenience overload: read the input source into a newly created Taxonomy and return it.
72 Taxonomy TaxonomyReader::read( std::shared_ptr<utils::BaseInputSource> source ) const
73 {
74  Taxonomy taxonomy;
    // Delegate to the overload that fills an existing Taxonomy.
75  read( source, taxonomy );
76  return taxonomy;
77 }
78 
79 // =================================================================================================
80 // Parsing
81 // =================================================================================================
82 
85  Taxonomy& tax
86 ) const {
    // Parse all lines of the stream `it` and add one taxon per usable line to `tax`.
    // The loop runs for as long as the stream evaluates to true.
87  while( it ) {
88  // Read the current line and extract its name, rank, and ID fields.
89  auto line = parse_line( it );
90 
    // Lines whose name field is empty do not describe a taxon.
91  if( line.name == "" ) {
92  // Such lines are skipped silently; a warning might be more appropriate here.
93  continue;
94  }
95 
96  // Parse the name as a taxopath and add it to the taxonomy.
    // expect_strict_order_ is passed as the `expect_parents` argument of add_from_taxopath().
97  auto& taxon = add_from_taxopath(
98  tax,
99  taxopath_parser_.parse( line.name ),
100  expect_strict_order_
101  );
102 
103  // Store rank and ID on the taxon. Unused fields (negative position) arrive as empty strings.
104  taxon.rank( line.rank );
105  taxon.id( line.id );
106  }
107 }
108 
111 ) const {
    // Read a single line from the stream `it` and return its name, rank, and ID fields as a Line.
    // Throws std::out_of_range if an active field position is beyond the number of fields in the line.
112  // Split the current line into its fields using the CSV reader.
113  auto fields = csv_reader_.parse_line( it );
114 
115  // Helper function to find the correct field for a property, or throw if invalid.
    // `field_pos` is the zero-based field position; `field_name` is only used in the error message.
116  auto get_field = [&] ( int field_pos, std::string field_name ) {
117  // A negative position means that the field is not used; yield an empty string then.
118  if( field_pos < 0 ) {
119  return std::string();
120  }
121 
122  // Convert to an unsigned index for the vector lookup; safe, as the value is non-negative here.
123  assert( field_pos >= 0 );
124  auto pos = static_cast< size_t >( field_pos );
125 
126  // The line has fewer fields than the requested position.
127  if( pos >= fields.size() ) {
128  throw std::out_of_range(
129  "Invalid position for taxonomy " + field_name + " field while reading. Expect "
130  + field_name + " at position " + std::to_string( pos ) + " (zero-based), "
131  + "but the line only contains " + std::to_string( fields.size() )
    // NOTE(review): the -1 presumably accounts for the stream already being positioned on the
    // line after the one that was just parsed - confirm against CsvReader::parse_line().
132  + " fields at line " + std::to_string( it.line() - 1 ) + "."
133  );
134  }
135 
136  // The position is valid; return a copy of the field.
137  assert( pos < fields.size() );
138  return fields[ pos ];
139  };
140 
141  // Extract name, rank, and ID from the fields, according to the configured positions.
142  Line result;
143  result.name = get_field( name_field_position_, "name" );
144  result.rank = get_field( rank_field_position_, "rank" );
145  result.id = get_field( id_field_position_, "ID" );
146 
147  return result;
148 }
149 
150 // =================================================================================================
151 // Properties
152 // =================================================================================================
153 
// Body of csv_reader(): return a reference to the CsvReader used for reading a taxonomy file.
155 {
156  return csv_reader_;
157 }
158 
// Body of taxopath_parser(): return a reference to the TaxopathParser used for parsing
// taxonomic path strings.
160 {
161  return taxopath_parser_;
162 }
163 
// Body of the name field position setter: store `value` as the zero-based position of the
// name field and return *this. Throws std::out_of_range for negative values.
165 {
166  // The position is an int rather than a size_t, for consistency with the rank field position.
167  // Hence, negative values have to be rejected explicitly here.
168  if( value < 0 ) {
169  throw std::out_of_range(
170  "Cannot set TaxonomyReader::name_field_position to a value below zero."
171  );
172  }
173  name_field_position_ = value;
174  return *this;
175 }
176 
// Body of name_field_position() const: return the currently set position of the field
// in each line where the taxon name is located.
178 {
179  return name_field_position_;
180 }
181 
// Body of the rank field position setter: store `value` and return *this.
183 {
    // No range check: when reading a line, a negative position marks the field as unused,
    // and an empty string is used for it instead.
184  rank_field_position_ = value;
185  return *this;
186 }
187 
// Body of rank_field_position() const: return the currently set position of the field
// in each line where the rank name is located.
189 {
190  return rank_field_position_;
191 }
192 
// Body of the ID field position setter: store `value` and return *this.
194 {
    // No range check: when reading a line, a negative position marks the field as unused,
    // and an empty string is used for it instead.
195  id_field_position_ = value;
196  return *this;
197 }
198 
// Body of id_field_position() const: return the currently set position of the field
// in each line where the ID is located.
200 {
201  return id_field_position_;
202 }
203 
// Body of the strict-order setter: store `value` and return *this.
// The stored flag is forwarded to add_from_taxopath() when parsing a document.
205 {
206  expect_strict_order_ = value;
207  return *this;
208 }
209 
// Body of expect_strict_order() const: return whether the reader currently expects
// a strict order of taxa.
211 {
212  return expect_strict_order_;
213 }
214 
215 } // namespace taxonomy
216 } // namespace genesis
genesis::taxonomy::TaxonomyReader::read
void read(std::shared_ptr< utils::BaseInputSource > source, Taxonomy &target) const
Read taxonomy data from an input source, and add the contents to a Taxonomy.
Definition: taxonomy_reader.cpp:66
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::utils::CsvReader::parse_line
std::vector< std::string > parse_line(utils::InputStream &input_stream) const
Parse one line of the CSV data and return it.
Definition: utils/formats/csv/reader.cpp:160
genesis::utils::CsvReader::separator_chars
CsvReader & separator_chars(std::string const &chars)
Set the chars used to separate fields of the CSV data.
Definition: utils/formats/csv/reader.hpp:259
genesis::taxonomy::TaxonomyReader::expect_strict_order
bool expect_strict_order() const
Return whether the reader currently expects a strict order of taxa.
Definition: taxonomy_reader.cpp:210
taxopath.hpp
fs.hpp
Provides functions for accessing the file system.
genesis::taxonomy::TaxonomyReader
Read Taxonomy file formats.
Definition: taxonomy_reader.hpp:108
genesis::taxonomy::TaxonomyReader::Line
Internal helper structure that stores the relevant data of one line while reading.
Definition: taxonomy_reader.hpp:119
genesis::taxonomy::TaxonomyReader::name_field_position
int name_field_position() const
Get the currently set position of the field in each line where the taxon name is located.
Definition: taxonomy_reader.cpp:177
std.hpp
Provides some valuable additions to STD.
taxonomy.hpp
genesis::taxonomy::TaxonomyReader::id_field_position
int id_field_position() const
Get the currently set position of the field in each line where the ID is located.
Definition: taxonomy_reader.cpp:199
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::taxonomy::TaxonomyReader::Line::id
std::string id
Definition: taxonomy_reader.hpp:123
genesis::taxonomy::TaxopathParser::parse
Taxopath parse(std::string const &taxopath) const
Parse a taxonomic path string into a Taxopath object and return it.
Definition: taxopath_parser.cpp:48
genesis::taxonomy::TaxonomyReader::Line::rank
std::string rank
Definition: taxonomy_reader.hpp:122
genesis::taxonomy::TaxonomyReader::TaxonomyReader
TaxonomyReader()
Default constructor.
Definition: taxonomy_reader.cpp:57
genesis::taxonomy::TaxonomyReader::rank_field_position
int rank_field_position() const
Get the currently set position of the field in each line where the rank name is located.
Definition: taxonomy_reader.cpp:188
taxopath.hpp
genesis::taxonomy::TaxonomyReader::csv_reader
utils::CsvReader & csv_reader()
Get the CsvReader used for reading a taxonomy file.
Definition: taxonomy_reader.cpp:154
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::taxonomy::add_from_taxopath
Taxon & add_from_taxopath(Taxonomy &taxonomy, Taxopath const &taxopath, bool expect_parents)
Add a Taxon to a Taxonomy, using the taxonomic elements of a Taxopath.
Definition: taxopath.cpp:76
genesis::taxonomy::Taxonomy
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
Definition: taxonomy/taxonomy.hpp:96
taxon.hpp
genesis::taxonomy::TaxonomyReader::Line::name
std::string name
Definition: taxonomy_reader.hpp:121
genesis::utils::InputStream::line
size_t line() const
Return the current line of the input stream.
Definition: input_stream.hpp:461
genesis::taxonomy::TaxonomyReader::parse_line
Line parse_line(utils::InputStream &it) const
Read a single line of a taxonomy file and return the contained name, rank, and ID.
Definition: taxonomy_reader.cpp:109
genesis::taxonomy::TaxonomyReader::parse_document
void parse_document(utils::InputStream &it, Taxonomy &tax) const
Parse all data from an InputStream into a Taxonomy object.
Definition: taxonomy_reader.cpp:83
genesis::taxonomy::TaxopathParser
Helper class to parse a string containing a taxonomic path string into a Taxopath object.
Definition: taxopath_parser.hpp:81
taxonomy_reader.hpp
genesis::taxonomy::TaxonomyReader::taxopath_parser
TaxopathParser & taxopath_parser()
Get the TaxopathParser used for parsing taxonomic path strings.
Definition: taxonomy_reader.cpp:159
genesis::utils::CsvReader
Read Comma/Character Separated Values (CSV) data and other delimiter-separated formats.
Definition: utils/formats/csv/reader.hpp:70