A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
epca.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
40 
47 
48 #include <algorithm>
49 #include <cassert>
50 #include <cmath>
51 #include <numeric>
52 #include <stdexcept>
53 #include <vector>
54 
55 #ifdef GENESIS_OPENMP
56 # include <omp.h>
57 #endif
58 
59 namespace genesis {
60 namespace placement {
61 
62 // =================================================================================================
63 // Edge PCA Imbalance Vector
64 // =================================================================================================
65 
66 std::vector<double> epca_imbalance_vector( Sample const& sample, bool normalize )
67 {
68  // Result vector: imbalance of masses at each edge of the tree.
69  auto vec = std::vector<double>( sample.tree().edge_count(), 0.0 );
70 
71  // We need the masses per edge, and their sum, for later.
72  auto const masses = placement_mass_per_edges_with_multiplicities( sample );
73  auto const mass_sum = std::accumulate( masses.begin(), masses.end(), 0.0 );
74 
75  // Collect the placement masses at each link of the tree.
76  // Use init to -1 as indicator for assertions.
77  auto link_masses = std::vector<double>( sample.tree().link_count(), -1.0 );
78 
79  for( auto tree_it : postorder( sample.tree() )) {
80 
81  // Skip the last iteration. We are interested in edges, not in nodes.
82  if( tree_it.is_last_iteration() ) {
83  continue;
84  }
85 
86  // Get the indices of the links at both sides of the current edge.
87  // cur_idx is the link that points away from the root,
88  // out_idx is the link that points towards it (i.e., its subtree contains the root).
89  auto const cur_idx = tree_it.link().index();
90  auto const out_idx = tree_it.link().outer().index();
91 
92  // Assert that we have not processed those links before.
93  assert( link_masses[ cur_idx ] < 0.0 );
94  assert( link_masses[ out_idx ] < 0.0 );
95 
96  // Assert that the cur_idx belongs to the link away from the root.
97  // This is the case if the primary link of its node is the link itself,
98  // because the node uses this link to point towards the root - thus, the link itself
99  // is away from the root, while the out_idx link lies towards it.
100  assert( sample.tree().link_at( cur_idx ).node().primary_link().index() == cur_idx );
101 
102  // Some more ways to do the same assertion, just to be sure.
103  assert( tree_it.edge().index() == sample.tree().link_at( cur_idx ).edge().index() );
104  assert( tree_it.edge().primary_link().index() == out_idx );
105  assert( tree_it.edge().secondary_link().index() == cur_idx );
106 
107  // Leaf links have no mass.
108  if( is_leaf( tree_it.link() )) {
109  link_masses[ cur_idx ] = 0.0;
110 
111  // If the link belongs to an inner node, we calculate its mass as the sum of the masses
112  // of the other links of this node. Those have already been processed, as we are doing
113  // postorder traversal.
114  } else {
115 
116  // Collect the mass.
117  double round_sum = 0.0;
118 
119  // Iterate around all other links of the node that belongs to the cur_idx link.
120  auto round_link = &tree_it.link().next();
121  while( round_link != &tree_it.link() ) {
122 
123  // We are doing postorder traversal, so we should have seen this link before.
124  assert( link_masses[ round_link->index() ] >= 0.0 );
125 
126  // The mass of the subtree behind this link can be calculated from the total mass
127  // minus the mass of the link itself.
128  round_sum += mass_sum - link_masses[ round_link->index() ];
129 
130  // Next link of the node.
131  round_link = &round_link->next();
132  }
133 
134  // The sum should always be >0, but for numerical reaons, we better make sure it is.
135  link_masses[ cur_idx ] = std::max( 0.0, round_sum );
136  }
137 
138  // Calculate the mass at the other side of the edge. We need to correct negative values,
139  // which can occur for numerical reasons (in the order of e-12).
140  link_masses[out_idx] = std::max(
141  0.0,
142  mass_sum - link_masses[ cur_idx ] - masses[ tree_it.edge().index() ]
143  );
144 
145  // Make sure we have processed all masses that we are going to use.
146  assert( link_masses[cur_idx] > -1.0 );
147  assert( link_masses[out_idx] > -1.0 );
148 
149  // Finally, calculate the imbalance of the current edge,
150  // normalized by the total mass on the tree (expect for the mass of the current edge).
151  auto const imbalance = link_masses[cur_idx] - link_masses[out_idx];
152  if( normalize ) {
153  auto const normalizer = mass_sum - masses[ tree_it.edge().index() ];
154  assert( normalizer > 0.0 );
155  vec[ tree_it.edge().index() ] = imbalance / normalizer;
156  } else {
157  vec[ tree_it.edge().index() ] = imbalance;
158  }
159  }
160 
161  return vec;
162 }
163 
164 // =================================================================================================
165 // Edge PCA Imbalance Matrix
166 // =================================================================================================
167 
169  SampleSet const& samples,
170  bool include_leaves,
171  bool normalize
172 ) {
173  // If there are no samples, return empty matrix.
174  if( samples.size() == 0 ) {
175  return utils::Matrix<double>();
176  }
177 
178  // Check if all trees have the same tpology and edge nums.
179  if( ! all_identical_trees( samples )) {
180  throw std::runtime_error(
181  "Cannot calculate Edge PCA on trees that have a different topology."
182  );
183  }
184 
185  assert( samples.size() > 0 );
186  auto const edge_count = samples.at( 0 ).sample.tree().edge_count();
187 
188  if( include_leaves ) {
189 
190  auto imbalance_matrix = utils::Matrix<double>( samples.size(), edge_count );
191 
192  #pragma omp parallel for
193  for( size_t s = 0; s < samples.size(); ++s ) {
194  auto const& smp = samples[s].sample;
195  auto const imbalance_vec = epca_imbalance_vector( smp, normalize );
196 
197  // We need to have the right number of imbalance values.
198  assert( imbalance_vec.size() == edge_count );
199 
200  // Copy imbalance values to the matrix.
201  for( size_t i = 0; i < edge_count; ++i ) {
202 
203  // Either the edge is an inner edge, or (if not, i.e., it leads to a leaf),
204  // it's imbalance is minus 1, as all its mass is on the root side.
205  assert(
206  ! is_leaf( samples[s].sample.tree().edge_at(i).secondary_node() ) ||
207  utils::almost_equal_relative( imbalance_vec[ i ], -1.0 )
208  );
209 
210  imbalance_matrix( s, i ) = imbalance_vec[ i ];
211  }
212  }
213 
214  return imbalance_matrix;
215 
216  } else {
217 
218  // Get the indices of all edges that do not lead to a tip.
219  auto const inner_edge_indices = tree::inner_edge_indices( samples.at( 0 ).sample.tree() );
220 
221  // Prepare result
222  auto imbalance_matrix = utils::Matrix<double>( samples.size(), inner_edge_indices.size() );
223 
224  #pragma omp parallel for
225  for( size_t s = 0; s < samples.size(); ++s ) {
226  auto const& smp = samples[s].sample;
227  auto const imbalance_vec = epca_imbalance_vector( smp, normalize );
228 
229  // We need to have the right number of imbalance values, which also needs to be
230  // smaller than the number of inner edges (there can be no tree with just inner
231  // edges, thus the total number of edges has to be bigger).
232  assert( imbalance_vec.size() == edge_count );
233  assert( imbalance_vec.size() > inner_edge_indices.size() );
234 
235  // Copy those imbalance values to the matrix that belong to inner edges.
236  for( size_t i = 0; i < inner_edge_indices.size(); ++i ) {
237  auto idx = inner_edge_indices[i];
238  imbalance_matrix( s, i ) = imbalance_vec[ idx ];
239  }
240  }
241 
242  return imbalance_matrix;
243  }
244 }
245 
246 // =================================================================================================
247 // Splitify Transform with Kappa
248 // =================================================================================================
249 
250 void epca_splitify_transform( utils::Matrix<double>& imbalance_matrix, double kappa )
251 {
252  // Precondition check.
253  if( kappa < 0.0 ) {
254  throw std::runtime_error( "Argument for kappa must be non-negative." );
255  }
256 
257  // Save time if the transformation throws away the actual value.
258  // We do not need to calculate the abs and power in this case.
259  if( kappa == 0.0 ) {
260  for( auto& elem : imbalance_matrix ) {
261  elem = static_cast<double>( utils::signum( elem ));
262  }
263  return;
264  }
265 
266  // Save time if the transformation does not change anything.
267  if( kappa == 1.0 ) {
268  return;
269  }
270 
271  // If neither applies, do the full transformation.
272  for( auto& elem : imbalance_matrix ) {
273  elem = static_cast<double>( utils::signum( elem )) * std::pow( std::abs( elem ), kappa );
274  }
275 }
276 
277 // =================================================================================================
278 // Filter Constant Columns
279 // =================================================================================================
280 
281 std::vector<size_t> epca_filter_constant_columns( utils::Matrix<double>& imbalance_matrix, double epsilon )
282 {
283  // Get the column-wise min and max values.
284  auto const col_minmax = utils::matrix_col_minmax( imbalance_matrix );
285 
286  // Store which columns to keep, by index.
287  std::vector<size_t> keep_cols;
288  for( size_t c = 0; c < imbalance_matrix.cols(); ++c ) {
289  assert( col_minmax[c].min <= col_minmax[c].max );
290  if (( col_minmax[c].max - col_minmax[c].min ) > epsilon ) {
291  keep_cols.push_back( c );
292  }
293  }
294  assert( keep_cols.size() <= imbalance_matrix.cols() );
295 
296  // Produce new, filtered matrix.
297  auto new_mat = utils::Matrix<double>( imbalance_matrix.rows(), keep_cols.size() );
298 
299  #pragma omp parallel for
300  for( size_t r = 0; r < imbalance_matrix.rows(); ++r ) {
301  for( size_t i = 0; i < keep_cols.size(); ++i ) {
302  new_mat( r, i ) = imbalance_matrix( r, keep_cols[i] );
303  }
304  }
305 
306  // Overwrite the matrix.
307  imbalance_matrix = new_mat;
308  return keep_cols;
309 }
310 
311 // =================================================================================================
312 // Edge PCA
313 // =================================================================================================
314 
315 EpcaData epca( SampleSet const& samples, double kappa, double epsilon, size_t components )
316 {
317  // If there are no samples, return empty result.
318  if( samples.size() == 0 ) {
319  return EpcaData();
320  }
321 
322  // Calculate the imbalance_matrix.
323  auto imbalance_matrix = epca_imbalance_matrix( samples, false );
324  assert( samples.size() > 0 );
325  assert( imbalance_matrix.rows() == samples.size() );
326  assert( imbalance_matrix.cols() == tree::inner_edge_count( samples[0].sample.tree() ) );
327 
328  // Get the indices of the inner edges.
329  auto const inner_edge_indices = tree::inner_edge_indices( samples.at( 0 ).sample.tree() );
330  assert( imbalance_matrix.cols() == inner_edge_indices.size() );
331 
332  // Filter and transform the imbalance matrix.
333  auto const not_filtered_indices = epca_filter_constant_columns( imbalance_matrix, epsilon );
334  epca_splitify_transform( imbalance_matrix, kappa );
335 
336  // We now use the list of not filtered indices to selected from the list of inner edge indices.
337  // The result is just the indices of the edges that are still in the matrix.
338  std::vector<size_t> edge_indices;
339  for( auto const& not_filt : not_filtered_indices ) {
340  edge_indices.push_back( inner_edge_indices[ not_filt ] );
341  }
342  assert( edge_indices.size() == imbalance_matrix.cols() );
343 
344  // Get correct number of pca components.
345  if( components == 0 || components > imbalance_matrix.cols() ) {
346  components = imbalance_matrix.cols();
347  }
348 
349  // Run and return PCA.
351  imbalance_matrix, components, utils::PcaStandardization::kCovariance
352  );
353  assert( pca.eigenvalues.size() == components );
354  assert( pca.eigenvectors.rows() == edge_indices.size() );
355  assert( pca.eigenvectors.cols() == components );
356  assert( pca.projection.rows() == samples.size() );
357  assert( pca.projection.cols() == components );
358 
359  // Move data.
360  EpcaData result;
361  result.eigenvalues = std::move( pca.eigenvalues );
362  result.eigenvectors = std::move( pca.eigenvectors );
363  result.projection = std::move( pca.projection );
364  result.edge_indices = std::move( edge_indices );
365  return result;
366 }
367 
368 } // namespace placement
369 } // namespace genesis
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:105
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
std::vector< MinMaxPair< T > > matrix_col_minmax(Matrix< T > const &data)
Calculate the column-wise min and max values of a Matrix.
Definition: math/matrix.hpp:77
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
Standardize the mean, but not the variance of the data before performing the PCA. ...
TreeLink & primary_link()
Return the TreeLink that points towards the root.
Definition: node.hpp:108
Provides functions for working with Placements and Pqueries.
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
Definition: common.hpp:71
utils::Matrix< double > epca_imbalance_matrix(SampleSet const &samples, bool include_leaves, bool normalize)
Calculate the imbalance matrix of placment mass for all Samples in a SampleSet.
Definition: epca.cpp:168
bool all_identical_trees(SampleSet const &sample_set)
Returns true iff all Trees of the Samples in the set are identical.
Definition: sample_set.cpp:125
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Provides some valuable additions to STD.
TreeLink & link_at(size_t index)
Return the TreeLink at a certain index.
Definition: tree/tree.cpp:284
Matrix operators.
EpcaData epca(SampleSet const &samples, double kappa, double epsilon, size_t components)
Perform EdgePCA on a SampleSet.
Definition: epca.cpp:315
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
Helper stucture that collects the output of epca().
Definition: epca.hpp:76
std::vector< double > eigenvalues
Definition: epca.hpp:78
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
void normalize(Histogram &h, double total)
Definition: operations.cpp:61
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
std::vector< double > epca_imbalance_vector(Sample const &sample, bool normalize)
Calculate the imbalance of placement mass for each Edge of the given Sample.
Definition: epca.cpp:66
NamedSample & at(size_t index)
Get the NamedSample at a certain index position.
Definition: sample_set.hpp:194
PcaData principal_component_analysis(Matrix< double > const &data, size_t components, PcaStandardization standardization)
Perfom a Principal Component Analysis on a given data Matrix.
Definition: pca.cpp:242
std::vector< double > placement_mass_per_edges_with_multiplicities(Sample const &sample)
Return a vector that contains the sum of the masses of the PqueryPlacements per edge of the tree of t...
Definition: masses.cpp:87
void epca_splitify_transform(utils::Matrix< double > &imbalance_matrix, double kappa)
Perform a component-wise transformation of the imbalance matrix used for epca().
Definition: epca.cpp:250
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
std::vector< size_t > epca_filter_constant_columns(utils::Matrix< double > &imbalance_matrix, double epsilon)
Filter out columns that have nearly constant values, measured using an epsilon.
Definition: epca.cpp:281
std::vector< size_t > inner_edge_indices(Tree const &tree)
Get a list of the edge indices of all inner edges, that is, all TreeEdges that do not lead to a leaf ...
size_t link_count() const
Return the number of TreeLinks of the Tree.
Definition: tree/tree.cpp:342
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
bool almost_equal_relative(double lhs, double rhs, double max_rel_diff=std::numeric_limits< double >::epsilon())
Check whether two doubles are almost equal, using a relative epsilon to compare them.
Definition: common.hpp:108