A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ncbi.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include <cassert>
34 #include <fstream>
35 #include <functional>
36 #include <stdexcept>
37 #include <vector>
38 
39 namespace genesis {
40 namespace taxonomy {
41 
42 // =================================================================================================
43 // NCBI Reading stuff
44 // =================================================================================================
45 
47  utils::CsvReader::Table const& node_table,
48  size_t tax_id_pos,
49  size_t parent_tax_id_pos,
50  size_t rank_pos
51 ) {
52  NcbiNodeLookup result;
53 
54  // Helper to get a field or throw.
55  auto get_field = []( utils::CsvReader::Row const& line, size_t pos, std::string const& field_name ){
56  if( pos >= line.size() ) {
57  throw std::runtime_error(
58  "NCBI node table line does not contain position " + std::to_string( pos ) +
59  " for field " + field_name
60  );
61  }
62 
63  assert( pos < line.size() );
64  return line[ pos ];
65  };
66 
67  // Iterate lines and get all fields into the result lookup table.
68  for( auto const& line : node_table ) {
69  NcbiNode node;
70  node.tax_id = get_field( line, tax_id_pos, "tax_id" );
71  node.parent_tax_id = get_field( line, parent_tax_id_pos, "parent_tax_id" );
72  node.rank = get_field( line, rank_pos, "rank" );
73 
74  // We expect unique entries.
75  if( result.count( node.tax_id ) > 0 ) {
76  throw std::runtime_error( "Multiple entries for NCBI node with tax_id " + node.tax_id );
77  }
78 
79  result[ node.tax_id ] = node;
80  }
81 
82  return result;
83 }
84 
86  utils::CsvReader::Table const& name_table,
87  size_t tax_id_pos,
88  size_t name_pos,
89  size_t name_class_pos,
90  std::string name_class_filter
91 ) {
92  NcbiNameLookup result;
93 
94  // Helper to get a field or throw.
95  auto get_field = []( utils::CsvReader::Row const& line, size_t pos, std::string const& field_name ){
96  if( pos >= line.size() ) {
97  throw std::runtime_error(
98  "NCBI name table line does not contain position " + std::to_string( pos ) +
99  " for field " + field_name
100  );
101  }
102 
103  assert( pos < line.size() );
104  return line[ pos ];
105  };
106 
107  // Iterate lines and get all fields into the result lookup table.
108  for( auto const& line : name_table ) {
109  NcbiName name;
110  name.tax_id = get_field( line, tax_id_pos, "tax_id" );
111  name.name = get_field( line, name_pos, "name" );
112  name.name_class = get_field( line, name_class_pos, "name_class" );
113 
114  // Do not add if the name class does not fit.
115  if( name.name_class != name_class_filter ) {
116  continue;
117  }
118 
119  // We expect unique entries.
120  if( result.count( name.tax_id ) > 0 ) {
121  throw std::runtime_error( "Multiple entries for NCBI name with tax_id " + name.tax_id );
122  }
123 
124  result[ name.tax_id ] = name;
125  }
126 
127  return result;
128 }
129 
131  NcbiNodeLookup const& nodes,
132  NcbiNameLookup const& names
133 ) {
134  Taxonomy result;
135 
136  // Recursive function that addes a taxon to the taxonomy, and its parents first, if needed.
137  std::function<void( NcbiNode const& node )> add_taxon = [&]( NcbiNode const& node ){
138 
139  // Due to the recurson, it can happen that we already added the node before.
140  // In that case, we can simply skip.
141  if( node.taxon != nullptr ) {
142  return;
143  }
144 
145  // Safety
146  if( nodes.count( node.parent_tax_id ) == 0 ) {
147  throw std::runtime_error(
148  "Cannot find parent tax_id " + node.parent_tax_id + " for node " +
149  node.tax_id + " in the NCBI nodes."
150  );
151  }
152 
153  // Get the parent. We need it a few times, so only to the lookup once.
154  auto& parent_node = nodes.at( node.parent_tax_id );
155 
156  // We have two bases cases: either the immediate parent already exists in the taxonomy,
157  // or is the root. In both cases, we can simply add the taxon to this parent and are done.
158  Taxonomy* parent_tax = nullptr;
159 
160  // Base case: the parent exists.
161  if( parent_node.taxon != nullptr ) {
162  parent_tax = parent_node.taxon;
163  }
164 
165  // Base case: the parent is the root.
166  if( parent_node.tax_id == node.tax_id ) {
167  parent_tax = &result;
168  }
169 
170  // If we didn't find the parent, we have to recurse and create it first.
171  if( ! parent_tax ) {
172  add_taxon( parent_node );
173 
174  // Now, the parent has a taxon assigned, so use it.
175  assert( parent_node.taxon != nullptr );
176  parent_tax = parent_node.taxon;
177  }
178 
179  // Now we are sure that we have a valid parent.
180  assert( parent_tax );
181 
182  // Get the name of this taxon.
183  if( names.count( node.tax_id ) == 0 ) {
184  throw std::runtime_error( "No name found for tax_id " + node.tax_id );
185  }
186  auto const& name = names.at( node.tax_id ).name;
187 
188  // Add the taxon to the parent.
189  // We skip the check for taxa with the same name, as otherwise the function becomes way too
190  // slow. We trust the NCBI taxonomy just enough to think that there are no duplicates.
191  auto& added = parent_tax->add_child( name, false );
192  added.rank( node.rank );
193  added.id( node.tax_id );
194 
195  // Done. Store the taxon in the lookup for later.
196  node.taxon = &added;
197  };
198 
199  // Add all taxa to the taxonomy.
200  for( auto const& node_it : nodes ) {
201  auto const& node = node_it.second;
202 
203  // Add the taxon, recursively if needed.
204  add_taxon( node );
205  }
206 
207  return result;
208 }
209 
210 Taxonomy read_ncbi_taxonomy( std::string const& node_file, std::string const& name_file )
211 {
212  // Prepare a reader for the stupid NCBI table specifications.
213  // Why can't they use normal csv files like everyone else?
214  auto reader = utils::CsvReader();
215  reader.separator_chars( "|" );
216  reader.trim_chars( "\t" );
217  reader.quotation_chars( "" );
218 
219  // Read data into lookup tables.
220  auto const nodes = convert_ncbi_node_table( reader.from_file( node_file ));
221  auto const names = convert_ncbi_name_table( reader.from_file( name_file ));
222 
223  // Do the table untangling.
224  return convert_ncbi_tables( nodes, names );
225 }
226 
227 } // namespace taxonomy
228 } // namespace genesis
std::string name_class
Definition: ncbi.hpp:61
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:381
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
std::unordered_map< std::string, NcbiName > NcbiNameLookup
Definition: ncbi.hpp:65
NcbiNameLookup convert_ncbi_name_table(utils::CsvReader::Table const &name_table, size_t tax_id_pos, size_t name_pos, size_t name_class_pos, std::string name_class_filter)
Definition: ncbi.cpp:85
Taxonomy read_ncbi_taxonomy(std::string const &node_file, std::string const &name_file)
Definition: ncbi.cpp:210
Taxon & add_child(Taxon const &child, bool merge_duplicates=true)
Add a child Taxon as a copy of a given Taxon and return it.
Definition: taxonomy.cpp:182
NcbiNodeLookup convert_ncbi_node_table(utils::CsvReader::Table const &node_table, size_t tax_id_pos, size_t parent_tax_id_pos, size_t rank_pos)
Definition: ncbi.cpp:46
std::unordered_map< std::string, NcbiNode > NcbiNodeLookup
Definition: ncbi.hpp:64
Read Comma Separated Values (CSV) data and other delimiter-separated formats.
std::string parent_tax_id
Definition: ncbi.hpp:51
Taxonomy convert_ncbi_tables(NcbiNodeLookup const &nodes, NcbiNameLookup const &names)
Definition: ncbi.cpp:130
std::string const & rank() const
Return the rank of this taxon.
Definition: taxon.cpp:145