A library for working with phylogenetic and population genetic data.
v0.32.0
epca.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
40 
47 
48 #include <algorithm>
49 #include <cassert>
50 #include <cmath>
51 #include <numeric>
52 #include <stdexcept>
53 #include <vector>
54 
55 #ifdef GENESIS_OPENMP
56 # include <omp.h>
57 #endif
58 
59 namespace genesis {
60 namespace placement {
61 
62 // =================================================================================================
63 // Edge PCA Imbalance Vector
64 // =================================================================================================
65 
66 std::vector<double> epca_imbalance_vector( Sample const& sample, bool normalize )
67 {
68  // Result vector: imbalance of masses at each edge of the tree.
69  auto vec = std::vector<double>( sample.tree().edge_count(), 0.0 );
70 
71  // We need the masses per edge, and their sum, for later.
72  auto const masses = placement_mass_per_edges_with_multiplicities( sample );
73  auto const mass_sum = std::accumulate( masses.begin(), masses.end(), 0.0 );
74 
75  // Collect the placement masses at each link of the tree.
76  // Use init to -1 as indicator for assertions.
77  auto link_masses = std::vector<double>( sample.tree().link_count(), -1.0 );
78 
79  for( auto tree_it : postorder( sample.tree() )) {
80 
81  // Skip the last iteration. We are interested in edges, not in nodes.
82  if( tree_it.is_last_iteration() ) {
83  continue;
84  }
85 
86  // Get the indices of the links at both sides of the current edge.
87  // cur_idx is the link that points away from the root,
88  // out_idx is the link that points towards it (i.e., its subtree contains the root).
89  auto const cur_idx = tree_it.link().index();
90  auto const out_idx = tree_it.link().outer().index();
91 
92  // Assert that we have not processed those links before.
93  assert( link_masses[ cur_idx ] < 0.0 );
94  assert( link_masses[ out_idx ] < 0.0 );
95 
96  // Assert that the cur_idx belongs to the link away from the root.
97  // This is the case if the primary link of its node is the link itself,
98  // because the node uses this link to point towards the root - thus, the link itself
99  // is away from the root, while the out_idx link lies towards it.
100  assert( sample.tree().link_at( cur_idx ).node().primary_link().index() == cur_idx );
101 
102  // Some more ways to do the same assertion, just to be sure.
103  assert( tree_it.edge().index() == sample.tree().link_at( cur_idx ).edge().index() );
104  assert( tree_it.edge().primary_link().index() == out_idx );
105  assert( tree_it.edge().secondary_link().index() == cur_idx );
106 
107  // Leaf links have no mass.
108  if( is_leaf( tree_it.link() )) {
109  link_masses[ cur_idx ] = 0.0;
110 
111  // If the link belongs to an inner node, we calculate its mass as the sum of the masses
112  // of the other links of this node. Those have already been processed, as we are doing
113  // postorder traversal.
114  } else {
115 
116  // Collect the mass.
117  double round_sum = 0.0;
118 
119  // Iterate around all other links of the node that belongs to the cur_idx link.
120  auto round_link = &tree_it.link().next();
121  while( round_link != &tree_it.link() ) {
122 
123  // We are doing postorder traversal, so we should have seen this link before.
124  assert( link_masses[ round_link->index() ] >= 0.0 );
125 
126  // The mass of the subtree behind this link can be calculated from the total mass
127  // minus the mass of the link itself.
128  round_sum += mass_sum - link_masses[ round_link->index() ];
129 
130  // Next link of the node.
131  round_link = &round_link->next();
132  }
133 
134  // The sum should always be >0, but for numerical reaons, we better make sure it is.
135  link_masses[ cur_idx ] = std::max( 0.0, round_sum );
136  }
137 
138  // Calculate the mass at the other side of the edge. We need to correct negative values,
139  // which can occur for numerical reasons (in the order of e-12).
140  link_masses[out_idx] = std::max(
141  0.0,
142  mass_sum - link_masses[ cur_idx ] - masses[ tree_it.edge().index() ]
143  );
144 
145  // Make sure we have processed all masses that we are going to use.
146  assert( link_masses[cur_idx] > -1.0 );
147  assert( link_masses[out_idx] > -1.0 );
148 
149  // Finally, calculate the imbalance of the current edge,
150  // normalized by the total mass on the tree (expect for the mass of the current edge).
151  auto const imbalance = link_masses[cur_idx] - link_masses[out_idx];
152  if( normalize ) {
153  auto const normalizer = mass_sum - masses[ tree_it.edge().index() ];
154  assert( normalizer > 0.0 );
155  vec[ tree_it.edge().index() ] = imbalance / normalizer;
156  } else {
157  vec[ tree_it.edge().index() ] = imbalance;
158  }
159  }
160 
161  return vec;
162 }
163 
164 // =================================================================================================
165 // Edge PCA Imbalance Matrix
166 // =================================================================================================
167 
169  SampleSet const& samples,
170  bool include_leaves,
171  bool normalize
172 ) {
173  // If there are no samples, return empty matrix.
174  if( samples.size() == 0 ) {
175  return utils::Matrix<double>();
176  }
177 
178  // Check if all trees have the same tpology and edge nums.
179  if( ! all_identical_trees( samples )) {
180  throw std::runtime_error(
181  "Cannot calculate Edge PCA on trees that have a different topology."
182  );
183  }
184 
185  assert( samples.size() > 0 );
186  auto const edge_count = samples.at( 0 ).tree().edge_count();
187 
188  if( include_leaves ) {
189 
190  auto imbalance_matrix = utils::Matrix<double>( samples.size(), edge_count );
191 
192  #pragma omp parallel for
193  for( size_t s = 0; s < samples.size(); ++s ) {
194  auto const& smp = samples[s];
195  auto const imbalance_vec = epca_imbalance_vector( smp, normalize );
196 
197  // We need to have the right number of imbalance values.
198  assert( imbalance_vec.size() == edge_count );
199 
200  // Copy imbalance values to the matrix.
201  for( size_t i = 0; i < edge_count; ++i ) {
202 
203  // Either the edge is an inner edge, or (if not, i.e., it leads to a leaf),
204  // it's imbalance is minus 1, as all its mass is on the root side.
205  assert(
206  ! is_leaf( samples[s].tree().edge_at(i).secondary_node() ) ||
207  utils::almost_equal_relative( imbalance_vec[ i ], -1.0 )
208  );
209 
210  imbalance_matrix( s, i ) = imbalance_vec[ i ];
211  }
212  }
213 
214  return imbalance_matrix;
215 
216  } else {
217 
218  // Get the indices of all edges that do not lead to a tip.
219  auto const inner_edge_indices = tree::inner_edge_indices( samples.at( 0 ).tree() );
220 
221  // Prepare result
222  auto imbalance_matrix = utils::Matrix<double>( samples.size(), inner_edge_indices.size() );
223 
224  #pragma omp parallel for
225  for( size_t s = 0; s < samples.size(); ++s ) {
226  auto const& smp = samples[s];
227  auto const imbalance_vec = epca_imbalance_vector( smp, normalize );
228 
229  // We need to have the right number of imbalance values, which also needs to be
230  // smaller than the number of inner edges (there can be no tree with just inner
231  // edges, thus the total number of edges has to be bigger).
232  assert( imbalance_vec.size() == edge_count );
233  assert( imbalance_vec.size() > inner_edge_indices.size() );
234 
235  // Copy those imbalance values to the matrix that belong to inner edges.
236  for( size_t i = 0; i < inner_edge_indices.size(); ++i ) {
237  auto idx = inner_edge_indices[i];
238  imbalance_matrix( s, i ) = imbalance_vec[ idx ];
239  }
240  }
241 
242  return imbalance_matrix;
243  }
244 }
245 
246 // =================================================================================================
247 // Splitify Transform with Kappa
248 // =================================================================================================
249 
250 void epca_splitify_transform( utils::Matrix<double>& imbalance_matrix, double kappa )
251 {
252  // Precondition check.
253  if( kappa < 0.0 ) {
254  throw std::runtime_error( "Argument for kappa must be non-negative." );
255  }
256 
257  // Save time if the transformation throws away the actual value.
258  // We do not need to calculate the abs and power in this case.
259  if( kappa == 0.0 ) {
260  for( auto& elem : imbalance_matrix ) {
261  elem = static_cast<double>( utils::signum( elem ));
262  }
263  return;
264  }
265 
266  // Save time if the transformation does not change anything.
267  if( kappa == 1.0 ) {
268  return;
269  }
270 
271  // If neither applies, do the full transformation.
272  for( auto& elem : imbalance_matrix ) {
273  elem = static_cast<double>( utils::signum( elem )) * std::pow( std::abs( elem ), kappa );
274  }
275 }
276 
277 // =================================================================================================
278 // Edge PCA
279 // =================================================================================================
280 
281 EpcaData epca( SampleSet const& samples, double kappa, double epsilon, size_t components )
282 {
283  // If there are no samples, return empty result.
284  if( samples.size() == 0 ) {
285  return EpcaData();
286  }
287 
288  // Calculate the imbalance_matrix.
289  auto imbalance_matrix = epca_imbalance_matrix( samples, false );
290  assert( samples.size() > 0 );
291  assert( imbalance_matrix.rows() == samples.size() );
292  assert( imbalance_matrix.cols() == tree::inner_edge_count( samples[0].tree() ) );
293 
294  // Get the indices of the inner edges.
295  auto const inner_edge_indices = tree::inner_edge_indices( samples.at( 0 ).tree() );
296  assert( imbalance_matrix.cols() == inner_edge_indices.size() );
297 
298  // Filter and transform the imbalance matrix.
299  auto const not_filtered_indices = filter_constant_columns( imbalance_matrix, epsilon );
300  epca_splitify_transform( imbalance_matrix, kappa );
301 
302  // We now use the list of not filtered indices to selected from the list of inner edge indices.
303  // The result is just the indices of the edges that are still in the matrix.
304  std::vector<size_t> edge_indices;
305  for( auto const& not_filt : not_filtered_indices ) {
306  edge_indices.push_back( inner_edge_indices[ not_filt ] );
307  }
308  assert( edge_indices.size() == imbalance_matrix.cols() );
309 
310  // Get correct number of pca components.
311  if( components == 0 || components > imbalance_matrix.cols() ) {
312  components = imbalance_matrix.cols();
313  }
314 
315  // Run and return PCA.
317  imbalance_matrix, components, utils::PcaStandardization::kCovariance
318  );
319  assert( pca.eigenvalues.size() == components );
320  assert( pca.eigenvectors.rows() == edge_indices.size() );
321  assert( pca.eigenvectors.cols() == components );
322  assert( pca.projection.rows() == samples.size() );
323  assert( pca.projection.cols() == components );
324 
325  // Move data.
326  EpcaData result;
327  result.eigenvalues = std::move( pca.eigenvalues );
328  result.eigenvectors = std::move( pca.eigenvectors );
329  result.projection = std::move( pca.projection );
330  result.edge_indices = std::move( edge_indices );
331  return result;
332 }
333 
334 } // namespace placement
335 } // namespace genesis
genesis::placement::Sample::tree
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:124
genesis::placement::SampleSet
Store a set of Samples with associated names.
Definition: sample_set.hpp:54
genesis::placement::epca_imbalance_matrix
utils::Matrix< double > epca_imbalance_matrix(SampleSet const &samples, bool include_leaves, bool normalize)
Calculate the imbalance matrix of placment mass for all Samples in a SampleSet.
Definition: epca.cpp:168
genesis::tree::Tree::link_count
size_t link_count() const
Return the number of TreeLinks of the Tree.
Definition: tree/tree.hpp:256
genesis::utils::signum
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
Definition: common.hpp:104
genesis::utils::almost_equal_relative
bool almost_equal_relative(double lhs, double rhs, double max_rel_diff=std::numeric_limits< double >::epsilon())
Check whether two doubles are almost equal, using a relative epsilon to compare them.
Definition: common.hpp:157
common.hpp
genesis::placement::Sample
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on.
Definition: sample.hpp:68
pca.hpp
genesis::tree::TreeEdge::index
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:106
genesis::placement::epca_imbalance_vector
std::vector< double > epca_imbalance_vector(Sample const &sample, bool normalize)
Calculate the imbalance of placement mass for each Edge of the given Sample.
Definition: epca.cpp:66
functions.hpp
genesis::placement::EpcaData
Helper stucture that collects the output of epca().
Definition: epca.hpp:76
genesis::tree::inner_edge_indices
std::vector< size_t > inner_edge_indices(Tree const &tree)
Get a list of the edge indices of all inner edges, that is, all TreeEdges that do not lead to a leaf ...
Definition: tree/function/functions.cpp:217
std.hpp
Provides some valuable additions to STD.
functions.hpp
Provides functions for working with Placements and Pqueries.
genesis::utils::Matrix< double >
genesis::tree::TreeNode::primary_link
TreeLink & primary_link()
Return the TreeLink that points towards the root.
Definition: tree/tree/node.hpp:110
genesis::placement::EpcaData::eigenvectors
utils::Matrix< double > eigenvectors
Definition: epca.hpp:79
genesis::utils::PcaStandardization::kCovariance
@ kCovariance
Standardize the mean, but not the variance of the data before performing the PCA.
matrix.hpp
epca.hpp
masses.hpp
matrix.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::inner_edge_count
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
Definition: tree/function/functions.cpp:201
genesis::utils::principal_component_analysis
PcaData principal_component_analysis(Matrix< double > const &data, size_t components, PcaStandardization standardization)
Perfom a Principal Component Analysis on a given data Matrix.
Definition: pca.cpp:260
genesis::placement::EpcaData::projection
utils::Matrix< double > projection
Definition: epca.hpp:80
genesis::placement::SampleSet::size
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
genesis::placement::SampleSet::at
Sample & at(size_t index)
Definition: sample_set.hpp:184
genesis::placement::placement_mass_per_edges_with_multiplicities
std::vector< double > placement_mass_per_edges_with_multiplicities(Sample const &sample)
Return a vector that contains the sum of the masses of the PqueryPlacements per edge of the tree of t...
Definition: masses.cpp:87
operators.hpp
Matrix operators.
genesis::tree::Tree::link_at
TreeLink & link_at(size_t index)
Return the TreeLink at a certain index.
Definition: tree/tree.hpp:202
genesis::placement::EpcaData::eigenvalues
std::vector< double > eigenvalues
Definition: epca.hpp:78
postorder.hpp
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
sample_set.hpp
genesis::utils::filter_constant_columns
std::vector< size_t > filter_constant_columns(utils::Matrix< double > &data, double epsilon)
Filter out columns that have nearly constant values, measured using an epsilon.
Definition: math/matrix.cpp:158
genesis::utils::normalize
void normalize(Histogram &h, double total)
Definition: operations.cpp:61
genesis::placement::all_identical_trees
bool all_identical_trees(SampleSet const &sample_set)
Returns true iff all Trees of the Samples in the set are identical.
Definition: sample_set.cpp:125
genesis::tree::edge_count
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
Definition: tree/function/functions.cpp:212
helper.hpp
genesis::tree::postorder
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
Definition: tree/iterator/postorder.hpp:321
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
genesis::placement::epca_splitify_transform
void epca_splitify_transform(utils::Matrix< double > &imbalance_matrix, double kappa)
Perform a component-wise transformation of the imbalance matrix used for epca().
Definition: epca.cpp:250
genesis::placement::EpcaData::edge_indices
std::vector< size_t > edge_indices
Definition: epca.hpp:81
genesis::placement::epca
EpcaData epca(SampleSet const &samples, double kappa, double epsilon, size_t components)
Perform EdgePCA on a SampleSet.
Definition: epca.cpp:281