A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
epca.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
38 
45 
47 
48 #include <algorithm>
49 #include <cassert>
50 #include <cmath>
51 #include <numeric>
52 #include <stdexcept>
53 #include <vector>
54 
55 #ifdef GENESIS_OPENMP
56 # include <omp.h>
57 #endif
58 
59 namespace genesis {
60 namespace placement {
61 
62 // =================================================================================================
63 // Edge PCA Imbalance Vector
64 // =================================================================================================
65 
66 std::vector<double> epca_imbalance_vector( Sample const& sample, bool normalize )
67 {
68  // Result vector: imbalance of masses at each edge of the tree.
69  auto vec = std::vector<double>( sample.tree().edge_count(), 0.0 );
70 
71  // We need the masses per edge, and their sum, for later.
72  auto const masses = placement_weight_per_edge( sample );
73  auto const mass_sum = std::accumulate( masses.begin(), masses.end(), 0.0 );
74 
75  // Collect the placement masses at each link of the tree.
76  // Use init to -1 as indicator for assertions.
77  auto link_masses = std::vector<double>( sample.tree().link_count(), -1.0 );
78 
79  for( auto tree_it : postorder( sample.tree() )) {
80 
81  // Skip the last iteration. We are interested in edges, not in nodes.
82  if( tree_it.is_last_iteration() ) {
83  continue;
84  }
85 
86  // Get the indices of the links at both sides of the current edge.
87  // cur_idx is the link that points away from the root,
88  // out_idx is the link that points towards it (i.e., its subtree contains the root).
89  auto const cur_idx = tree_it.link().index();
90  auto const out_idx = tree_it.link().outer().index();
91 
92  // Assert that we have not processed those links before.
93  assert( link_masses[ cur_idx ] < 0.0 );
94  assert( link_masses[ out_idx ] < 0.0 );
95 
96  // Assert that the cur_idx belongs to the link away from the root.
97  // This is the case if the primary link of its node is the link itself,
98  // because the node uses this link to point towards the root - thus, the link itself
99  // is away from the root, while the out_idx link lies towards it.
100  assert( sample.tree().link_at( cur_idx ).node().primary_link().index() == cur_idx );
101 
102  // Some more ways to do the same assertion, just to be sure.
103  assert( tree_it.edge().index() == sample.tree().link_at( cur_idx ).edge().index() );
104  assert( tree_it.edge().primary_link().index() == out_idx );
105  assert( tree_it.edge().secondary_link().index() == cur_idx );
106 
107  // Leaf links have no mass.
108  if( tree_it.link().is_leaf() ) {
109  link_masses[ cur_idx ] = 0.0;
110 
111  // If the link belongs to an inner node, we calculate its mass as the sum of the masses
112  // of the other links of this node. Those have already been processed, as we are doing
113  // postorder traversal.
114  } else {
115 
116  // Collect the mass.
117  double round_sum = 0.0;
118 
119  // Iterate around all other links of the node that belongs to the cur_idx link.
120  auto round_link = &tree_it.link().next();
121  while( round_link != &tree_it.link() ) {
122 
123  // We are doing postorder traversal, so we should have seen this link before.
124  assert( link_masses[ round_link->index() ] >= 0.0 );
125 
126  // The mass of the subtree behind this link can be calculated from the total mass
127  // minus the mass of the link itself.
128  round_sum += mass_sum - link_masses[ round_link->index() ];
129 
130  // Next link of the node.
131  round_link = &round_link->next();
132  }
133 
134  // The sum should always be >0, but for numerical reaons, we better make sure it is.
135  link_masses[ cur_idx ] = std::max( 0.0, round_sum );
136  }
137 
138  // Calculate the mass at the other side of the edge. We need to correct negative values,
139  // which can occur for numerical reasons (in the order of e-12).
140  link_masses[out_idx] = std::max(
141  0.0,
142  mass_sum - link_masses[ cur_idx ] - masses[ tree_it.edge().index() ]
143  );
144 
145  // Make sure we have processed all masses that we are going to use.
146  assert( link_masses[cur_idx] > -1.0 );
147  assert( link_masses[out_idx] > -1.0 );
148 
149  // Finally, calculate the imbalance of the current edge,
150  // normalized by the total mass on the tree (expect for the mass of the current edge).
151  auto const imbalance = link_masses[cur_idx] - link_masses[out_idx];
152  if( normalize ) {
153  auto const normalize = mass_sum - masses[ tree_it.edge().index() ];
154  assert( normalize > 0.0 );
155  vec[ tree_it.edge().index() ] = imbalance / normalize;
156  } else {
157  vec[ tree_it.edge().index() ] = imbalance;
158  }
159  }
160 
161  return vec;
162 }
163 
164 // =================================================================================================
165 // Edge PCA Imbalance Matrix
166 // =================================================================================================
167 
168 utils::Matrix<double> epca_imbalance_matrix( SampleSet const& samples, bool include_leaves )
169 {
170  // If there are no samples, return empty matrix.
171  if( samples.size() == 0 ) {
172  return utils::Matrix<double>();
173  }
174 
175  // Check if all trees have the same tpology and edge nums.
176  if( ! all_identical_trees( samples )) {
177  throw std::runtime_error(
178  "Cannot calculate Edge PCA on trees that have a different topology."
179  );
180  }
181 
182  assert( samples.size() > 0 );
183  auto const edge_count = samples.at( 0 ).sample.tree().edge_count();
184 
185  if( include_leaves ) {
186 
187  auto imbalance_matrix = utils::Matrix<double>( samples.size(), edge_count );
188 
189  #pragma omp parallel for
190  for( size_t s = 0; s < samples.size(); ++s ) {
191  auto const& smp = samples[s].sample;
192  auto const imbalance_vec = epca_imbalance_vector( smp, true );
193 
194  // We need to have the right number of imbalance values.
195  assert( imbalance_vec.size() == edge_count );
196 
197  // Copy imbalance values to the matrix.
198  for( size_t i = 0; i < edge_count; ++i ) {
199 
200  // Either the edge is an inner edge, or (if not, i.e., it leads to a leaf),
201  // it's imbalance is minus 1, as all its mass is on the root side.
202  assert(
203  ! samples[s].sample.tree().edge_at(i).secondary_node().is_leaf() ||
204  utils::almost_equal_relative( imbalance_vec[ i ], -1.0 )
205  );
206 
207  imbalance_matrix( s, i ) = imbalance_vec[ i ];
208  }
209  }
210 
211  return imbalance_matrix;
212 
213  } else {
214 
215  // Get the indices of all edges that do not lead to a tip.
216  std::vector<size_t> inner_edge_indices;
217  for( auto const& edge_it : samples.at( 0 ).sample.tree().edges() ) {
218  if( edge_it->secondary_node().is_inner() ) {
219  inner_edge_indices.push_back( edge_it->index() );
220  }
221  }
222 
223  auto imbalance_matrix = utils::Matrix<double>( samples.size(), inner_edge_indices.size() );
224 
225  #pragma omp parallel for
226  for( size_t s = 0; s < samples.size(); ++s ) {
227  auto const& smp = samples[s].sample;
228  auto const imbalance_vec = epca_imbalance_vector( smp, true );
229 
230  // We need to have the right number of imbalance values, which also needs to be
231  // smaller than the number of inner edges (there can be no tree with just inner
232  // edges, thus the total number of edges has to be bigger).
233  assert( imbalance_vec.size() == edge_count );
234  assert( imbalance_vec.size() > inner_edge_indices.size() );
235 
236  // Copy those imbalance values to the matrix that belong to inner edges.
237  for( size_t i = 0; i < inner_edge_indices.size(); ++i ) {
238  auto idx = inner_edge_indices[i];
239  imbalance_matrix( s, i ) = imbalance_vec[ idx ];
240  }
241  }
242 
243  return imbalance_matrix;
244  }
245 }
246 
247 // =================================================================================================
248 // Splitify Transform with Kappa
249 // =================================================================================================
250 
251 void epca_splitify_transform( utils::Matrix<double>& imbalance_matrix, double kappa )
252 {
253  // Precondition check.
254  if( kappa < 0.0 ) {
255  throw std::runtime_error( "Argument for kappa must be non-negative." );
256  }
257 
258  // Save time if the transformation throws away the actual value.
259  // We do not need to calculate the abs and power in this case.
260  if( kappa == 0.0 ) {
261  for( auto& elem : imbalance_matrix ) {
262  elem = static_cast<double>( utils::signum( elem ));
263  }
264  return;
265  }
266 
267  // Save time if the transformation does not change anything.
268  if( kappa == 1.0 ) {
269  return;
270  }
271 
272  // If neither applies, do the full transformation.
273  for( auto& elem : imbalance_matrix ) {
274  elem = static_cast<double>( utils::signum( elem )) * std::pow( std::abs( elem ), kappa );
275  }
276 }
277 
278 // =================================================================================================
279 // Filter Constant Columns
280 // =================================================================================================
281 
282 std::vector<size_t> epca_filter_constant_columns( utils::Matrix<double>& imbalance_matrix, double epsilon )
283 {
284  // Get the column-wise min and max values.
285  auto const col_minmax = utils::matrix_col_minmax( imbalance_matrix );
286 
287  // Store which columns to keep, by index.
288  std::vector<size_t> keep_cols;
289  for( size_t c = 0; c < imbalance_matrix.cols(); ++c ) {
290  assert( col_minmax[c].min <= col_minmax[c].max );
291  if (( col_minmax[c].max - col_minmax[c].min ) > epsilon ) {
292  keep_cols.push_back( c );
293  }
294  }
295  assert( keep_cols.size() <= imbalance_matrix.cols() );
296 
297  // Produce new, filtered matrix.
298  auto new_mat = utils::Matrix<double>( imbalance_matrix.rows(), keep_cols.size() );
299 
300  #pragma omp parallel for
301  for( size_t r = 0; r < imbalance_matrix.rows(); ++r ) {
302  for( size_t i = 0; i < keep_cols.size(); ++i ) {
303  new_mat( r, i ) = imbalance_matrix( r, keep_cols[i] );
304  }
305  }
306 
307  // Overwrite the matrix.
308  imbalance_matrix = new_mat;
309  return keep_cols;
310 }
311 
312 // =================================================================================================
313 // Edge PCA
314 // =================================================================================================
315 
316 utils::PcaData epca( SampleSet const& samples, double kappa, double epsilon, size_t components )
317 {
318  // If there are no samples, return empty result.
319  if( samples.size() == 0 ) {
320  return utils::PcaData();
321  }
322 
323  // Calculate the imbalance_matrix.
324  auto imbalance_matrix = epca_imbalance_matrix( samples, false );
325  assert( samples.size() > 0 );
326  assert( imbalance_matrix.rows() == samples.size() );
327  assert( imbalance_matrix.cols() == tree::inner_edge_count( samples[0].sample.tree() ) );
328 
329  // Filter and transform the imbalance matrix.
330  epca_filter_constant_columns( imbalance_matrix, epsilon );
331  epca_splitify_transform( imbalance_matrix, kappa );
332 
333  // Get correct number of pca components.
334  if( components == 0 || components > imbalance_matrix.cols() ) {
335  components = imbalance_matrix.cols();
336  }
337 
338  // Run and return PCA.
340  imbalance_matrix, components, utils::PcaStandardization::kCovariance
341  );
342 }
343 
344 } // namespace placement
345 } // namespace genesis
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
utils::Matrix< double > epca_imbalance_matrix(SampleSet const &samples, bool include_leaves)
Calculate the imbalance matrix of placment mass for all Samples in a SampleSet.
Definition: epca.cpp:168
size_t cols() const
Definition: matrix.hpp:156
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
std::vector< MinMaxPair< T > > matrix_col_minmax(Matrix< T > const &data)
Calculate the column-wise min and max values of a Matrix.
Definition: statistics.hpp:71
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
Standardize the mean, but not the variance of the data before performing the PCA. ...
std::vector< double > placement_weight_per_edge(Sample const &sample)
Return a vector that contains the sum of the weights of the PqueryPlacements per edge of the tree of ...
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
Definition: common.hpp:240
bool all_identical_trees(SampleSet const &sample_set)
Returns true iff all Trees of the Samples in the set are identical.
Helper stucture that collects the output of principal_component_analysis().
Definition: pca.hpp:95
Provides some valuable additions to STD.
TreeLink & link_at(size_t index)
Return the TreeLink at a certain index.
Definition: tree/tree.cpp:284
TreeLink & primary_link()
Return the TreeLink that points towards the root.
Definition: node.cpp:56
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Matrix statstic functions.
Store a set of Samples with associated names.
Definition: sample_set.hpp:52
NamedSample & at(size_t index)
Get the NamedSample at a certain index position.
Definition: sample_set.cpp:110
Provides easy and fast logging functionality.
size_t size() const
Return the size of the SampleSet, i.e., the number of Samples.
Definition: sample_set.cpp:135
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
void normalize(Histogram &h, double total)
Definition: operations.cpp:61
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
size_t rows() const
Definition: matrix.hpp:151
std::vector< double > epca_imbalance_vector(Sample const &sample, bool normalize)
Calculate the imbalance of placement mass for each Edge of the given Sample.
Definition: epca.cpp:66
Matrix operators.
PcaData principal_component_analysis(Matrix< double > const &data, size_t components, PcaStandardization standardization)
Perfom a Principal Component Analysis on a given data Matrix.
Definition: pca.cpp:242
void epca_splitify_transform(utils::Matrix< double > &imbalance_matrix, double kappa)
Perform a component-wise transformation of the imbalance matrix used for epca().
Definition: epca.cpp:251
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
std::vector< size_t > epca_filter_constant_columns(utils::Matrix< double > &imbalance_matrix, double epsilon)
Filter out columns that have nearly constant values, measured using an epsilon.
Definition: epca.cpp:282
utils::PcaData epca(SampleSet const &samples, double kappa, double epsilon, size_t components)
Definition: epca.cpp:316
size_t link_count() const
Return the number of TreeLinks of the Tree.
Definition: tree/tree.cpp:342
bool almost_equal_relative(double lhs, double rhs, double max_rel_diff=std::numeric_limits< double >::epsilon())
Check whether two doubles are almost equal, using a relative epsilon to compare them.
Definition: common.hpp:277