A toolkit for working with phylogenetic data.
v0.24.0
phylo_ilr.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 #include "genesis/tree/tree.hpp"
41 
44 
45 #include <cassert>
46 #include <cmath>
47 #include <stdexcept>
48 #include <unordered_set>
49 #include <vector>
50 
51 #ifdef GENESIS_OPENMP
52 # include <omp.h>
53 #endif
54 
55 namespace genesis {
56 namespace tree {
57 
58 // =================================================================================================
59 // Phylogenetic ILR Tranform
60 // =================================================================================================
61 
63  BalanceData const& data,
64  bool reverse_signs
65 ) {
66  // Basic checks specific for this function. More checks are done in mass_balance()
67  if( data.tree.empty() ) {
68  assert( data.edge_masses.size() == 0 );
69  assert( data.taxon_weights.size() == 0 );
70  return {};
71  }
72  if( ! is_rooted( data.tree )) {
73  throw std::invalid_argument(
74  "Tree is not rooted. Cannot calculate its Phylogenetic ILR tranform."
75  );
76  }
77  if( ! is_bifurcating( data.tree )) {
78  throw std::invalid_argument(
79  "Tree is not bifurcating. Cannot calculate its Phylogenetic ILR tranform."
80  );
81  }
82 
83  // Helper to get the edge indices of a particular subtree, including the edge that leads to it.
84  // It is slightly inefficient to first store the indices and then do the lookup of masses
85  // later, but this way, we save code duplication of the balance calculation function...
86  auto get_subtree_indices_ = []( Subtree const& subtree ){
87  std::unordered_set<size_t> sub_indices;
88  for( auto it : preorder( subtree )) {
89 
90  // The iterator visits each edge of the subtree once.
91  assert( sub_indices.count( it.edge().index() ) == 0 );
92 
93  // The preorder iterator is node based, so we start at the root node of the subtree,
94  // meaning that we automatically also insert its edge index! No special case needed.
95  sub_indices.insert( it.edge().index() );
96  }
97  return sub_indices;
98  };
99 
100  // Prepare result matrix.
101  auto result = utils::Matrix<double>( data.edge_masses.rows(), data.tree.node_count(), 0.0 );
102 
103  // Calculate balance for every node of the tree.
104  #pragma omp parallel for
105  for( size_t node_idx = 0; node_idx < data.tree.node_count(); ++node_idx ) {
106  auto const& node = data.tree.node_at( node_idx );
107  assert( node.index() == node_idx );
108 
109  // For leaf nodes do nothing. They just keep their initial value of 0.0.
110  auto const deg = degree( node );
111  if( deg == 1 ) {
112  continue;
113  }
114 
115  // Get the indices of the edges in the two subtrees down from the given node.
116  // We need a special case for the root, because its links are a bit different.
117  // (If we ignore this, we'd get a flipped sign at the root.)
118  std::unordered_set<size_t> lhs_indices;
119  std::unordered_set<size_t> rhs_indices;
120  if( deg == 2 ) {
121  assert( is_root( node ));
122 
123  // The tree is rooted, so that the root node, the left hand side is the primary
124  // link of the node itself, and the right hand side the next one.
125  lhs_indices = get_subtree_indices_( Subtree{ node.link().outer() });
126  rhs_indices = get_subtree_indices_( Subtree{ node.link().next().outer() });
127 
128  // After that, we should have all edges of the tree.
129  assert( lhs_indices.size() + rhs_indices.size() == data.tree.edge_count() );
130  } else {
131  assert( deg == 3 );
132 
133  // At inner nodes, the primary link points towards the root, so we use the next two links.
134  lhs_indices = get_subtree_indices_( Subtree{ node.link().next().outer() });
135  rhs_indices = get_subtree_indices_( Subtree{ node.link().next().next().outer() });
136 
137  // We never have more edges than the tree.
138  assert( lhs_indices.size() + rhs_indices.size() < data.tree.edge_count() );
139  }
140 
141  // If needed, flip lhs and rhs.
142  if( reverse_signs ) {
143  std::swap( lhs_indices, rhs_indices );
144  }
145 
146  // Calculate and store the balance for all rows (trees) of the data.
147  result.col( node_idx ) = mass_balance( data, lhs_indices, rhs_indices );
148  }
149 
150  return result;
151 }
152 
154  BalanceData const& data,
155  bool reverse_signs
156 ) {
157  // Basic checks specific for this function. More checks are done in mass_balance()
158  if( data.tree.empty() ) {
159  assert( data.edge_masses.size() == 0 );
160  assert( data.taxon_weights.size() == 0 );
161  return {};
162  }
163 
164  // Helper to get the edge indices of a particular subtree, excluding the edge that leads to it.
165  auto get_subtree_indices_ = []( Subtree const& subtree ){
166  std::unordered_set<size_t> sub_indices;
167  for( auto it : preorder( subtree )) {
168 
169  // The iterator visits each edge of the subtree once.
170  assert( sub_indices.count( it.edge().index() ) == 0 );
171 
172  // Skip the top edge, where the subtree is attached.
173  if( it.is_first_iteration() ) {
174  continue;
175  }
176 
177  sub_indices.insert( it.edge().index() );
178  }
179  return sub_indices;
180  };
181 
182  // Prepare result matrix.
183  auto result = utils::Matrix<double>( data.edge_masses.rows(), data.tree.edge_count(), 0.0 );
184 
185  // Calculate balance for every node of the tree.
186  #pragma omp parallel for
187  for( size_t edge_idx = 0; edge_idx < data.tree.edge_count(); ++edge_idx ) {
188  auto const& edge = data.tree.edge_at( edge_idx );
189  assert( edge.index() == edge_idx );
190 
191  // For leaf edges do nothing. They just keep their initial value of 0.0.
192  if( is_leaf( edge )) {
193  continue;
194  }
195 
196  // Get the indices of the edges in the two subtrees away from the given edge.
197  auto p_indices = get_subtree_indices_( Subtree{ edge.primary_link() });
198  auto s_indices = get_subtree_indices_( Subtree{ edge.secondary_link() });
199 
200  // After that, we should have all edges of the tree except one.
201  assert( p_indices.size() + s_indices.size() == data.tree.edge_count() - 1 );
202 
203  // If needed, flip lhs and rhs.
204  if( reverse_signs ) {
205  std::swap( p_indices, s_indices );
206  }
207 
208  // Calculate and store the balance for all rows (trees) of the data.
209  result.col( edge_idx ) = mass_balance( data, s_indices, p_indices );
210  }
211 
212  return result;
213 }
214 
215 } // namespace tree
216 } // namespace genesis
bool is_rooted(Tree const &tree)
Return whether the Tree is rooted, that is, whether the root node has two neighbors.
bool is_root(TreeLink const &link)
Return whether the link belongs to the root node of its Tree.
utils::Matrix< double > edge_balances(BalanceData const &data, bool reverse_signs)
Calculate edge balances using the Isometric Log Ratio transformation.
Definition: phylo_ilr.cpp:153
void swap(SequenceSet &lhs, SequenceSet &rhs)
Tree operator functions.
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
bool is_bifurcating(Tree const &tree, bool loose)
Return whether the Tree is bifurcating.
Data needed to calculate balances.
Definition: balances.hpp:170
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
Reference to a subtree of a Tree.
Definition: subtree.hpp:69
TreeNode & node_at(size_t index)
Return the TreeNode at a certain index.
Definition: tree/tree.hpp:220
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
std::vector< double > mass_balance(BalanceData const &data, std::unordered_set< size_t > const &numerator_edge_indices, std::unordered_set< size_t > const &denominator_edge_indices)
Calculate the balance of edge masses between two sets of edges.
Definition: balances.cpp:357
TreeLink const & link() const
Get the TreeLink that separates the subtree from the rest of the tree.
Definition: subtree.hpp:125
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
utils::Matrix< double > phylogenetic_ilr_transform(BalanceData const &data, bool reverse_signs)
Calculate the Phylogenetic Isometric Log Ratio transformation.
Definition: phylo_ilr.cpp:62
utils::Range< IteratorPreorder< true > > preorder(ElementType const &element)
Tree tree
The Tree on which to calculate balances.
Definition: balances.hpp:179
std::vector< double > taxon_weights
The taxon/edge weights calculated from mulitple trees.
Definition: balances.hpp:229
Header of Tree class.
utils::Matrix< double > edge_masses
The relative per-edge masses per original input Tree.
Definition: balances.hpp:197
size_t degree(TreeLink const &link)
Return the degree of the node for a given TreeLink, i.e. how many neighbouring nodes it has...
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.