A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
epca.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
38 
45 
46 #include <algorithm>
47 #include <cassert>
48 #include <cmath>
49 #include <numeric>
50 #include <stdexcept>
51 #include <vector>
52 
53 #ifdef GENESIS_OPENMP
54 # include <omp.h>
55 #endif
56 
57 namespace genesis {
58 namespace placement {
59 
60 // =================================================================================================
61 // Edge PCA Imbalance Vector
62 // =================================================================================================
63 
64 std::vector<double> epca_imbalance_vector( Sample const& sample, bool normalize )
65 {
66  // Result vector: imbalance of masses at each edge of the tree.
67  auto vec = std::vector<double>( sample.tree().edge_count(), 0.0 );
68 
69  // We need the masses per edge, and their sum, for later.
70  auto const masses = placement_weight_per_edge( sample );
71  auto const mass_sum = std::accumulate( masses.begin(), masses.end(), 0.0 );
72 
73  // Collect the placement masses at each link of the tree.
74  // Use init to -1 as indicator for assertions.
75  auto link_masses = std::vector<double>( sample.tree().link_count(), -1.0 );
76 
77  for( auto tree_it : postorder( sample.tree() )) {
78 
79  // Skip the last iteration. We are interested in edges, not in nodes.
80  if( tree_it.is_last_iteration() ) {
81  continue;
82  }
83 
84  // Get the indices of the links at both sides of the current edge.
85  // cur_idx is the link that points away from the root,
86  // out_idx is the link that points towards it (i.e., its subtree contains the root).
87  auto const cur_idx = tree_it.link().index();
88  auto const out_idx = tree_it.link().outer().index();
89 
90  // Assert that we have not processed those links before.
91  assert( link_masses[ cur_idx ] < 0.0 );
92  assert( link_masses[ out_idx ] < 0.0 );
93 
94  // Assert that the cur_idx belongs to the link away from the root.
95  // This is the case if the primary link of its node is the link itself,
96  // because the node uses this link to point towards the root - thus, the link itself
97  // is away from the root, while the out_idx link lies towards it.
98  assert( sample.tree().link_at( cur_idx ).node().primary_link().index() == cur_idx );
99 
100  // Some more ways to do the same assertion, just to be sure.
101  assert( tree_it.edge().index() == sample.tree().link_at( cur_idx ).edge().index() );
102  assert( tree_it.edge().primary_link().index() == out_idx );
103  assert( tree_it.edge().secondary_link().index() == cur_idx );
104 
105  // Leaf links have no mass.
106  if( tree_it.link().is_leaf() ) {
107  link_masses[ cur_idx ] = 0.0;
108 
109  // If the link belongs to an inner node, we calculate its mass as the sum of the masses
110  // of the other links of this node. Those have already been processed, as we are doing
111  // postorder traversal.
112  } else {
113 
114  // Collect the mass.
115  double round_sum = 0.0;
116 
117  // Iterate around all other links of the node that belongs to the cur_idx link.
118  auto round_link = &tree_it.link().next();
119  while( round_link != &tree_it.link() ) {
120 
121  // We are doing postorder traversal, so we should have seen this link before.
122  assert( link_masses[ round_link->index() ] >= 0.0 );
123 
124  // The mass of the subtree behind this link can be calculated from the total mass
125  // minus the mass of the link itself.
126  round_sum += mass_sum - link_masses[ round_link->index() ];
127 
128  // Next link of the node.
129  round_link = &round_link->next();
130  }
131 
132  // The sum should always be >0, but for numerical reaons, we better make sure it is.
133  link_masses[ cur_idx ] = std::max( 0.0, round_sum );
134  }
135 
136  // Calculate the mass at the other side of the edge. We need to correct negative values,
137  // which can occur for numerical reasons (in the order of e-12).
138  link_masses[out_idx] = std::max(
139  0.0,
140  mass_sum - link_masses[ cur_idx ] - masses[ tree_it.edge().index() ]
141  );
142 
143  // Make sure we have processed all masses that we are going to use.
144  assert( link_masses[cur_idx] > -1.0 );
145  assert( link_masses[out_idx] > -1.0 );
146 
147  // Finally, calculate the imbalance of the current edge,
148  // normalized by the total mass on the tree (expect for the mass of the current edge).
149  auto const imbalance = link_masses[cur_idx] - link_masses[out_idx];
150  if( normalize ) {
151  auto const normalizer = mass_sum - masses[ tree_it.edge().index() ];
152  assert( normalizer > 0.0 );
153  vec[ tree_it.edge().index() ] = imbalance / normalizer;
154  } else {
155  vec[ tree_it.edge().index() ] = imbalance;
156  }
157  }
158 
159  return vec;
160 }
161 
162 // =================================================================================================
163 // Edge PCA Imbalance Matrix
164 // =================================================================================================
165 
167  SampleSet const& samples,
168  bool include_leaves,
169  bool normalize
170 ) {
171  // If there are no samples, return empty matrix.
172  if( samples.size() == 0 ) {
173  return utils::Matrix<double>();
174  }
175 
176  // Check if all trees have the same tpology and edge nums.
177  if( ! all_identical_trees( samples )) {
178  throw std::runtime_error(
179  "Cannot calculate Edge PCA on trees that have a different topology."
180  );
181  }
182 
183  assert( samples.size() > 0 );
184  auto const edge_count = samples.at( 0 ).sample.tree().edge_count();
185 
186  if( include_leaves ) {
187 
188  auto imbalance_matrix = utils::Matrix<double>( samples.size(), edge_count );
189 
190  #pragma omp parallel for
191  for( size_t s = 0; s < samples.size(); ++s ) {
192  auto const& smp = samples[s].sample;
193  auto const imbalance_vec = epca_imbalance_vector( smp, normalize );
194 
195  // We need to have the right number of imbalance values.
196  assert( imbalance_vec.size() == edge_count );
197 
198  // Copy imbalance values to the matrix.
199  for( size_t i = 0; i < edge_count; ++i ) {
200 
201  // Either the edge is an inner edge, or (if not, i.e., it leads to a leaf),
202  // it's imbalance is minus 1, as all its mass is on the root side.
203  assert(
204  ! samples[s].sample.tree().edge_at(i).secondary_node().is_leaf() ||
205  utils::almost_equal_relative( imbalance_vec[ i ], -1.0 )
206  );
207 
208  imbalance_matrix( s, i ) = imbalance_vec[ i ];
209  }
210  }
211 
212  return imbalance_matrix;
213 
214  } else {
215 
216  // Get the indices of all edges that do not lead to a tip.
217  auto const inner_edge_indices = tree::inner_edge_indices( samples.at( 0 ).sample.tree() );
218 
219  // Prepare result
220  auto imbalance_matrix = utils::Matrix<double>( samples.size(), inner_edge_indices.size() );
221 
222  #pragma omp parallel for
223  for( size_t s = 0; s < samples.size(); ++s ) {
224  auto const& smp = samples[s].sample;
225  auto const imbalance_vec = epca_imbalance_vector( smp, normalize );
226 
227  // We need to have the right number of imbalance values, which also needs to be
228  // smaller than the number of inner edges (there can be no tree with just inner
229  // edges, thus the total number of edges has to be bigger).
230  assert( imbalance_vec.size() == edge_count );
231  assert( imbalance_vec.size() > inner_edge_indices.size() );
232 
233  // Copy those imbalance values to the matrix that belong to inner edges.
234  for( size_t i = 0; i < inner_edge_indices.size(); ++i ) {
235  auto idx = inner_edge_indices[i];
236  imbalance_matrix( s, i ) = imbalance_vec[ idx ];
237  }
238  }
239 
240  return imbalance_matrix;
241  }
242 }
243 
244 // =================================================================================================
245 // Splitify Transform with Kappa
246 // =================================================================================================
247 
248 void epca_splitify_transform( utils::Matrix<double>& imbalance_matrix, double kappa )
249 {
250  // Precondition check.
251  if( kappa < 0.0 ) {
252  throw std::runtime_error( "Argument for kappa must be non-negative." );
253  }
254 
255  // Save time if the transformation throws away the actual value.
256  // We do not need to calculate the abs and power in this case.
257  if( kappa == 0.0 ) {
258  for( auto& elem : imbalance_matrix ) {
259  elem = static_cast<double>( utils::signum( elem ));
260  }
261  return;
262  }
263 
264  // Save time if the transformation does not change anything.
265  if( kappa == 1.0 ) {
266  return;
267  }
268 
269  // If neither applies, do the full transformation.
270  for( auto& elem : imbalance_matrix ) {
271  elem = static_cast<double>( utils::signum( elem )) * std::pow( std::abs( elem ), kappa );
272  }
273 }
274 
275 // =================================================================================================
276 // Filter Constant Columns
277 // =================================================================================================
278 
279 std::vector<size_t> epca_filter_constant_columns( utils::Matrix<double>& imbalance_matrix, double epsilon )
280 {
281  // Get the column-wise min and max values.
282  auto const col_minmax = utils::matrix_col_minmax( imbalance_matrix );
283 
284  // Store which columns to keep, by index.
285  std::vector<size_t> keep_cols;
286  for( size_t c = 0; c < imbalance_matrix.cols(); ++c ) {
287  assert( col_minmax[c].min <= col_minmax[c].max );
288  if (( col_minmax[c].max - col_minmax[c].min ) > epsilon ) {
289  keep_cols.push_back( c );
290  }
291  }
292  assert( keep_cols.size() <= imbalance_matrix.cols() );
293 
294  // Produce new, filtered matrix.
295  auto new_mat = utils::Matrix<double>( imbalance_matrix.rows(), keep_cols.size() );
296 
297  #pragma omp parallel for
298  for( size_t r = 0; r < imbalance_matrix.rows(); ++r ) {
299  for( size_t i = 0; i < keep_cols.size(); ++i ) {
300  new_mat( r, i ) = imbalance_matrix( r, keep_cols[i] );
301  }
302  }
303 
304  // Overwrite the matrix.
305  imbalance_matrix = new_mat;
306  return keep_cols;
307 }
308 
309 // =================================================================================================
310 // Edge PCA
311 // =================================================================================================
312 
313 EpcaData epca( SampleSet const& samples, double kappa, double epsilon, size_t components )
314 {
315  // If there are no samples, return empty result.
316  if( samples.size() == 0 ) {
317  return EpcaData();
318  }
319 
320  // Calculate the imbalance_matrix.
321  auto imbalance_matrix = epca_imbalance_matrix( samples, false );
322  assert( samples.size() > 0 );
323  assert( imbalance_matrix.rows() == samples.size() );
324  assert( imbalance_matrix.cols() == tree::inner_edge_count( samples[0].sample.tree() ) );
325 
326  // Get the indices of the inner edges.
327  auto const inner_edge_indices = tree::inner_edge_indices( samples.at( 0 ).sample.tree() );
328  assert( imbalance_matrix.cols() == inner_edge_indices.size() );
329 
330  // Filter and transform the imbalance matrix.
331  auto const not_filtered_indices = epca_filter_constant_columns( imbalance_matrix, epsilon );
332  epca_splitify_transform( imbalance_matrix, kappa );
333 
334  // We now use the list of not filtered indices to selected from the list of inner edge indices.
335  // The result is just the indices of the edges that are still in the matrix.
336  std::vector<size_t> edge_indices;
337  for( auto const& not_filt : not_filtered_indices ) {
338  edge_indices.push_back( inner_edge_indices[ not_filt ] );
339  }
340  assert( edge_indices.size() == imbalance_matrix.cols() );
341 
342  // Get correct number of pca components.
343  if( components == 0 || components > imbalance_matrix.cols() ) {
344  components = imbalance_matrix.cols();
345  }
346 
347  // Run and return PCA.
349  imbalance_matrix, components, utils::PcaStandardization::kCovariance
350  );
351  assert( pca.eigenvalues.size() == components );
352  assert( pca.eigenvectors.rows() == edge_indices.size() );
353  assert( pca.eigenvectors.cols() == components );
354  assert( pca.projection.rows() == samples.size() );
355  assert( pca.projection.cols() == components );
356 
357  // Move data.
358  EpcaData result;
359  result.eigenvalues = std::move( pca.eigenvalues );
360  result.eigenvectors = std::move( pca.eigenvectors );
361  result.projection = std::move( pca.projection );
362  result.edge_indices = std::move( edge_indices );
363  return result;
364 }
365 
366 } // namespace placement
367 } // namespace genesis
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
std::vector< MinMaxPair< T > > matrix_col_minmax(Matrix< T > const &data)
Calculate the column-wise min and max values of a Matrix.
Definition: math/matrix.hpp:70
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
Standardize the mean, but not the variance of the data before performing the PCA. ...
std::vector< double > placement_weight_per_edge(Sample const &sample)
Return a vector that contains the sum of the weights of the PqueryPlacements per edge of the tree of ...
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
Definition: common.hpp:71
utils::Matrix< double > epca_imbalance_matrix(SampleSet const &samples, bool include_leaves, bool normalize)
Calculate the imbalance matrix of placment mass for all Samples in a SampleSet.
Definition: epca.cpp:166
bool all_identical_trees(SampleSet const &sample_set)
Returns true iff all Trees of the Samples in the set are identical.
Definition: sample_set.cpp:124
Provides some valuable additions to STD.
TreeLink & link_at(size_t index)
Return the TreeLink at a certain index.
Definition: tree/tree.cpp:284
Matrix operators.
TreeLink & primary_link()
Return the TreeLink that points towards the root.
Definition: node.cpp:56
EpcaData epca(SampleSet const &samples, double kappa, double epsilon, size_t components)
Perform EdgePCA on a SampleSet.
Definition: epca.cpp:313
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
Helper stucture that collects the output of epca().
Definition: epca.hpp:76
std::vector< double > eigenvalues
Definition: epca.hpp:78
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.hpp:234
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
void normalize(Histogram &h, double total)
Definition: operations.cpp:61
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
std::vector< double > epca_imbalance_vector(Sample const &sample, bool normalize)
Calculate the imbalance of placement mass for each Edge of the given Sample.
Definition: epca.cpp:64
NamedSample & at(size_t index)
Get the NamedSample at a certain index position.
Definition: sample_set.hpp:194
PcaData principal_component_analysis(Matrix< double > const &data, size_t components, PcaStandardization standardization)
Perfom a Principal Component Analysis on a given data Matrix.
Definition: pca.cpp:242
void epca_splitify_transform(utils::Matrix< double > &imbalance_matrix, double kappa)
Perform a component-wise transformation of the imbalance matrix used for epca().
Definition: epca.cpp:248
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
std::vector< size_t > epca_filter_constant_columns(utils::Matrix< double > &imbalance_matrix, double epsilon)
Filter out columns that have nearly constant values, measured using an epsilon.
Definition: epca.cpp:279
std::vector< size_t > inner_edge_indices(Tree const &tree)
Get a list of the edge indices of all inner edges, that is, all TreeEdges that do not lead to a leaf ...
size_t link_count() const
Return the number of TreeLinks of the Tree.
Definition: tree/tree.cpp:342
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
bool almost_equal_relative(double lhs, double rhs, double max_rel_diff=std::numeric_limits< double >::epsilon())
Check whether two doubles are almost equal, using a relative epsilon to compare them.
Definition: common.hpp:108