A library for working with phylogenetic and population genetic data.
v0.32.0
tree/formats/table/reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
37 
38 #include <cassert>
39 #include <unordered_map>
40 #include <unordered_set>
41 #include <stdexcept>
42 
43 namespace genesis {
44 namespace tree {
45 
46 // =================================================================================================
47 // Table Reader
48 // =================================================================================================
49 
50 struct TreeTableHelpers
51 {
52  // Map from node names to their node index in the tree.
53  std::unordered_map<std::string, size_t> name_to_index;
54 
55  // Keep track of which node names appears as children and parents.
56  std::unordered_set<std::string> child_names;
57  std::unordered_set<std::string> parent_names;
58 };
59 
61  TreeTableHelpers& helpers,
62  std::string const& child_name,
63  std::string const& parent_name,
64  Tree& tree
65 ) {
66  // Each child can only have a single parent.
67  if( helpers.child_names.count( child_name ) > 0 ) {
68  throw std::runtime_error(
69  "Node name \"" + child_name + "\" appears multiple times as a child node"
70  );
71  }
72  helpers.child_names.insert( child_name );
73  helpers.parent_names.insert( parent_name );
74 
75  // Shortcut to tree containers.
76  auto& links = tree.expose_link_container();
77  auto& nodes = tree.expose_node_container();
78  auto& edges = tree.expose_edge_container();
79 
80  // Helper function to add a node to the tree and to the map.
81  auto add_named_node_ = [&]( std::string const& node_name )
82  {
83  // Get the index that the node will have in the tree.
84  auto const node_index = nodes.size();
85 
86  // Create a new node. For now, we use CommonNodeData for simplicty.
87  // We set all properties except for the link, which is done later.
88  auto new_node = utils::make_unique< TreeNode >();
89  new_node->reset_index( node_index );
90  new_node->reset_data( CommonNodeData::create() );
91  new_node->data<CommonNodeData>().name = node_name;
92 
93  // Finally, add everything to the tree and store it in the map.
94  nodes.push_back( std::move( new_node ));
95  helpers.name_to_index[ node_name ] = node_index;
96 
97  return node_index;
98  };
99 
100  // See if there is a parent with the name already.
101  // If not, we create it first, in the tree and in the map.
102  bool is_new_parent = false;
103  if( helpers.name_to_index.count( parent_name ) == 0 ) {
104  add_named_node_( parent_name );
105  is_new_parent = true;
106  }
107 
108  // Same for the child.
109  bool is_new_child = false;
110  if( helpers.name_to_index.count( child_name ) == 0 ) {
111  add_named_node_( child_name );
112  is_new_child = true;
113  }
114 
115  // Now we can get both nodes from the map.
116  auto* parent_node = &tree.node_at( helpers.name_to_index[ parent_name ]);
117  auto* child_node = &tree.node_at( helpers.name_to_index[ child_name ]);
118 
119  // At this point, we have the nodes, but not the links and the edge.
120  // and the nodes are missing the respective pointers to those.
121 
122  // First create the new elements we need, to have all pointers.
123  auto parent_link = utils::make_unique< TreeLink >();
124  auto child_link = utils::make_unique< TreeLink >();
125  auto new_edge = utils::make_unique< TreeEdge >();
126 
127  // At the parent, we make a new link to connect to the child node.
128  // If we just created the parent, this is the first link of the parent,
129  // so we need special treatment.
130  parent_link->reset_index( links.size() );
131  if( is_new_parent ) {
132  parent_link->reset_next( parent_link.get() );
133  parent_node->reset_primary_link( parent_link.get() );
134  } else {
135  parent_link->reset_next( &parent_node->primary_link().next() );
136  parent_node->primary_link().reset_next( parent_link.get() );
137  }
138  parent_link->reset_outer( child_link.get() );
139  parent_link->reset_node( parent_node );
140  parent_link->reset_edge( new_edge.get() );
141 
142  // Now the child link. Similar to the parent, but we additionally need to reset
143  // its primary link to the new edge, in case that it existed as a singular
144  // edge before, in order to connect it in the correct direction.
145  child_link->reset_index( links.size() + 1 );
146  if( is_new_child ) {
147  child_link->reset_next( child_link.get() );
148  child_node->reset_primary_link( child_link.get() );
149  } else {
150  child_link->reset_next( &child_node->primary_link().next() );
151  child_node->primary_link().reset_next( child_link.get() );
152  child_node->reset_primary_link( child_link.get() );
153  }
154  child_link->reset_outer( parent_link.get() );
155  child_link->reset_node( child_node );
156  child_link->reset_edge( new_edge.get() );
157 
158  // Set up the new edge to connect the node to its parent.
159  new_edge->reset_index( edges.size() );
160  new_edge->reset_primary_link( parent_link.get() );
161  new_edge->reset_secondary_link( child_link.get() );
162  new_edge->reset_data( CommonEdgeData::create() );
163 
164  // Finally move everything to the tree.
165  links.push_back( std::move( parent_link ));
166  links.push_back( std::move( child_link ));
167  edges.push_back( std::move( new_edge ));
168 }
169 
171  std::vector<std::string> const& child_names,
172  std::vector<std::string> const& parent_names
173 ) {
174  if( child_names.size() != parent_names.size() ) {
175  throw std::runtime_error(
176  "Cannot create tree from parents table with different number of entries in columns"
177  );
178  }
179  if( child_names.empty() ) {
180  return Tree();
181  }
182 
183  // Resulting tree.
184  Tree tree;
185 
186  // Make a hash table from node names to their node index in the tree.
187  // Use that to iterate through, once, and create all nodes.
188  TreeTableHelpers helpers;
189  for( size_t i = 0; i < parent_names.size(); ++i ) {
191  helpers, child_names[i], parent_names[i], tree
192  );
193  }
194 
195  // We now have a tree with all elements set up correctly, but not the root.
196  // We just check all nodes for the root propterty, and set the tree root to that.
197  // If multiple nodes fit that, we have an error, as that's a forest.
198  // That's hence a data integrity check.
199  std::vector<std::string> root_nodes;
200  for( auto& node : tree.nodes() ) {
201  if( is_root( node )) {
202  tree.reset_root_link( &node.primary_link() );
203  root_nodes.push_back( node.data<CommonNodeData>().name );
204  }
205  }
206  if( root_nodes.size() != 1 ) {
207  throw std::invalid_argument(
208  "Provided list of child and parent nodes does not form a singe tree, but a forest with " +
209  std::to_string( root_nodes.size() ) + " root nodes: " + utils::join( root_nodes )
210  );
211  }
212  assert( tree.node_count() == child_names.size() + 1 );
213 
214  // // We now want to find all names that are only parents, but not also children.
215  // // These are the root nodes. If there is more than one, this is an error.
216  // // This hence also serves as a sanity check for the tree root.
217  // size_t root_node_index = tree.node_count();
218  // for( auto const& parent : helpers.parent_names ) {
219  // if( helpers.child_names.count( parent ) == 0 ) {
220  // // Found a parent.
221  // if( root_node_index != tree.node_count() ) {
222  // throw std::runtime_error(
223  // "Node name lists contain multiple parent nodes."
224  // );
225  // }
226  // assert( helpers.name_to_index.count( parent ) > 0 );
227  // root_node_index = helpers.name_to_index[ parent ];
228  // }
229  // }
230  // assert( tree.root_node().index() == root_node_index );
231 
232  return tree;
233 }
234 
235 } // namespace tree
236 } // namespace genesis
genesis::tree::Tree::node_count
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
genesis::tree::make_tree_from_parents_table
Tree make_tree_from_parents_table(std::vector< std::string > const &child_names, std::vector< std::string > const &parent_names)
Create a tree, given lists of child parent pairs.
Definition: tree/formats/table/reader.cpp:170
genesis::tree::CommonEdgeData::create
static std::unique_ptr< CommonEdgeData > create()
Definition: tree/common_tree/tree.hpp:168
genesis::tree::Tree::expose_link_container
LinkContainerType & expose_link_container()
Get the container that stores all TreeLinks of the Tree.
Definition: tree/tree.cpp:192
functions.hpp
genesis::tree::Tree::expose_node_container
NodeContainerType & expose_node_container()
Get the container that stores all TreeNodes of the Tree.
Definition: tree/tree.cpp:197
genesis::tree::Tree::node_at
TreeNode & node_at(size_t index)
Return the TreeNode at a certain index.
Definition: tree/tree.hpp:220
std.hpp
Provides some valuable additions to STD.
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
reader.hpp
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
genesis::tree::Tree::reset_root_link
Tree & reset_root_link(TreeLink *root_link)
Reset the link that is considered to be the root of the Tree.
Definition: tree/tree.cpp:184
genesis::tree::is_root
bool is_root(TreeLink const &link)
Return whether the link belongs to the root node of its Tree.
Definition: tree/function/functions.cpp:89
genesis::tree::CommonNodeData::name
std::string name
Name of the node.
Definition: tree/common_tree/tree.hpp:127
genesis::tree::Tree::expose_edge_container
EdgeContainerType & expose_edge_container()
Get the container that stores all TreeEdges of the Tree.
Definition: tree/tree.cpp:202
genesis::utils::join
Interval< DataType, NumericalType, IntervalKind > join(Interval< DataType, NumericalType, IntervalKind > const &a, Interval< DataType, NumericalType, IntervalKind > const &b)
Creates a new Interval that contains both intervals and whatever is between.
Definition: utils/containers/interval_tree/functions.hpp:127
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::Tree::nodes
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
tree.hpp
genesis::tree::make_tree_from_parents_table_add_edge_
void make_tree_from_parents_table_add_edge_(TreeTableHelpers &helpers, std::string const &child_name, std::string const &parent_name, Tree &tree)
Definition: tree/formats/table/reader.cpp:60
genesis::tree::CommonNodeData
Common class containing the commonly needed data for tree nodes.
Definition: tree/common_tree/tree.hpp:79
genesis::tree::CommonNodeData::create
static std::unique_ptr< CommonNodeData > create()
Definition: tree/common_tree/tree.hpp:103