A toolkit for working with phylogenetic data.
v0.24.0
ncbi.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <cassert>
36 #include <fstream>
37 #include <functional>
38 #include <stdexcept>
39 #include <vector>
40 
41 namespace genesis {
42 namespace taxonomy {
43 
44 // =================================================================================================
45 // NCBI Reading stuff
46 // =================================================================================================
47 
49  utils::CsvReader::Table const& node_table,
50  size_t tax_id_pos,
51  size_t parent_tax_id_pos,
52  size_t rank_pos
53 ) {
54  NcbiNodeLookup result;
55 
56  // Helper to get a field or throw.
57  auto get_field = []( utils::CsvReader::Line const& line, size_t pos, std::string const& field_name ){
58  if( pos >= line.size() ) {
59  throw std::runtime_error(
60  "NCBI node table line does not contain position " + std::to_string( pos ) +
61  " for field " + field_name
62  );
63  }
64 
65  assert( pos < line.size() );
66  return line[ pos ];
67  };
68 
69  // Iterate lines and get all fields into the result lookup table.
70  for( auto const& line : node_table ) {
71  NcbiNode node;
72  node.tax_id = get_field( line, tax_id_pos, "tax_id" );
73  node.parent_tax_id = get_field( line, parent_tax_id_pos, "parent_tax_id" );
74  node.rank = get_field( line, rank_pos, "rank" );
75 
76  // We expect unique entries.
77  if( result.count( node.tax_id ) > 0 ) {
78  throw std::runtime_error( "Multiple entries for NCBI node with tax_id " + node.tax_id );
79  }
80 
81  result[ node.tax_id ] = node;
82  }
83 
84  return result;
85 }
86 
88  utils::CsvReader::Table const& name_table,
89  size_t tax_id_pos,
90  size_t name_pos,
91  size_t name_class_pos,
92  std::string const& name_class_filter
93 ) {
94  NcbiNameLookup result;
95 
96  // Helper to get a field or throw.
97  auto get_field = []( utils::CsvReader::Line const& line, size_t pos, std::string const& field_name ){
98  if( pos >= line.size() ) {
99  throw std::runtime_error(
100  "NCBI name table line does not contain position " + std::to_string( pos ) +
101  " for field " + field_name
102  );
103  }
104 
105  assert( pos < line.size() );
106  return line[ pos ];
107  };
108 
109  // Iterate lines and get all fields into the result lookup table.
110  for( auto const& line : name_table ) {
111  NcbiName name;
112  name.tax_id = get_field( line, tax_id_pos, "tax_id" );
113  name.name = get_field( line, name_pos, "name" );
114  name.name_class = get_field( line, name_class_pos, "name_class" );
115 
116  // Do not add if the name class does not fit.
117  if( name.name_class != name_class_filter ) {
118  continue;
119  }
120 
121  // We expect unique entries.
122  if( result.count( name.tax_id ) > 0 ) {
123  throw std::runtime_error( "Multiple entries for NCBI name with tax_id " + name.tax_id );
124  }
125 
126  result[ name.tax_id ] = name;
127  }
128 
129  return result;
130 }
131 
133  NcbiNodeLookup const& nodes,
134  NcbiNameLookup const& names
135 ) {
136  Taxonomy result;
137 
138  // Recursive function that addes a taxon to the taxonomy, and its parents first, if needed.
139  std::function<void( NcbiNode const& node )> add_taxon = [&]( NcbiNode const& node ){
140 
141  // Due to the recurson, it can happen that we already added the node before.
142  // In that case, we can simply skip.
143  if( node.taxon != nullptr ) {
144  return;
145  }
146 
147  // Safety
148  if( nodes.count( node.parent_tax_id ) == 0 ) {
149  throw std::runtime_error(
150  "Cannot find parent tax_id " + node.parent_tax_id + " for node " +
151  node.tax_id + " in the NCBI nodes."
152  );
153  }
154 
155  // Get the parent. We need it a few times, so only to the lookup once.
156  auto& parent_node = nodes.at( node.parent_tax_id );
157 
158  // We have two bases cases: either the immediate parent already exists in the taxonomy,
159  // or is the root. In both cases, we can simply add the taxon to this parent and are done.
160  Taxonomy* parent_tax = nullptr;
161 
162  // Base case: the parent exists.
163  if( parent_node.taxon != nullptr ) {
164  parent_tax = parent_node.taxon;
165  }
166 
167  // Base case: the parent is the root.
168  if( parent_node.tax_id == node.tax_id ) {
169  parent_tax = &result;
170  }
171 
172  // If we didn't find the parent, we have to recurse and create it first.
173  if( ! parent_tax ) {
174  add_taxon( parent_node );
175 
176  // Now, the parent has a taxon assigned, so use it.
177  assert( parent_node.taxon != nullptr );
178  parent_tax = parent_node.taxon;
179  }
180 
181  // Now we are sure that we have a valid parent.
182  assert( parent_tax );
183 
184  // Get the name of this taxon.
185  if( names.count( node.tax_id ) == 0 ) {
186  throw std::runtime_error( "No name found for tax_id " + node.tax_id );
187  }
188  auto const& name = names.at( node.tax_id ).name;
189 
190  // Add the taxon to the parent.
191  // We skip the check for taxa with the same name, as otherwise the function becomes way too
192  // slow. We trust the NCBI taxonomy just enough to think that there are no duplicates.
193  auto& added = parent_tax->add_child( name, false );
194  added.rank( node.rank );
195  added.id( node.tax_id );
196 
197  // Done. Store the taxon in the lookup for later.
198  node.taxon = &added;
199  };
200 
201  // Add all taxa to the taxonomy.
202  for( auto const& node_it : nodes ) {
203  auto const& node = node_it.second;
204 
205  // Add the taxon, recursively if needed.
206  add_taxon( node );
207  }
208 
209  return result;
210 }
211 
212 Taxonomy read_ncbi_taxonomy( std::string const& node_file, std::string const& name_file )
213 {
214  // Prepare a reader for the stupid NCBI table specifications.
215  // Why can't they use normal csv files like everyone else?
216  auto reader = utils::CsvReader();
217  reader.separator_chars( "|" );
218  reader.trim_chars( "\t" );
219  reader.quotation_chars( "" );
220 
221  // Read data into lookup tables.
222  auto const nodes = convert_ncbi_node_table( reader.read( utils::from_file( node_file )));
223  auto const names = convert_ncbi_name_table( reader.read( utils::from_file( name_file )));
224 
225  // Do the table untangling.
226  return convert_ncbi_tables( nodes, names );
227 }
228 
229 } // namespace taxonomy
230 } // namespace genesis
std::string name_class
Definition: ncbi.hpp:61
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string const & rank() const
Return the rank of this taxon.
Definition: taxon.cpp:145
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
std::unordered_map< std::string, NcbiName > NcbiNameLookup
Definition: ncbi.hpp:65
Taxonomy read_ncbi_taxonomy(std::string const &node_file, std::string const &name_file)
Definition: ncbi.cpp:212
Taxon & add_child(Taxon const &child, bool merge_duplicates=true)
Add a child Taxon as a copy of a given Taxon and return it.
Definition: taxonomy.cpp:182
NcbiNodeLookup convert_ncbi_node_table(utils::CsvReader::Table const &node_table, size_t tax_id_pos, size_t parent_tax_id_pos, size_t rank_pos)
Definition: ncbi.cpp:48
std::unordered_map< std::string, NcbiNode > NcbiNodeLookup
Definition: ncbi.hpp:64
NcbiNameLookup convert_ncbi_name_table(utils::CsvReader::Table const &name_table, size_t tax_id_pos, size_t name_pos, size_t name_class_pos, std::string const &name_class_filter)
Definition: ncbi.cpp:87
std::shared_ptr< BaseInputSource > from_file(std::string const &file_name, bool detect_compression=true)
Obtain an input source for reading from a file.
Read Comma/Character Separated Values (CSV) data and other delimiter-separated formats.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
std::string parent_tax_id
Definition: ncbi.hpp:51
Taxonomy convert_ncbi_tables(NcbiNodeLookup const &nodes, NcbiNameLookup const &names)
Definition: ncbi.cpp:132