A toolkit for working with phylogenetic data.
v0.24.0
balances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 #include "genesis/tree/tree.hpp"
39 
43 
44 #include <algorithm>
45 #include <cassert>
46 #include <cmath>
47 #include <functional>
48 #include <limits>
49 #include <stdexcept>
50 #include <unordered_set>
51 #include <vector>
52 
53 #ifdef GENESIS_OPENMP
54 # include <omp.h>
55 #endif
56 
57 namespace genesis {
58 namespace tree {
59 
60 // =================================================================================================
61 // Balances Data Calculation
62 // =================================================================================================
63 
65  std::vector<MassTree> const& trees,
66  BalanceSettings settings
67 ) {
68  // Prepare (empty) result.
69  BalanceData result;
70 
71  // Basic checks.
72  if( trees.empty() ) {
73  return result;
74  }
75  for( auto const& tree : trees ) {
76  if( ! tree_data_is< MassTreeNodeData, MassTreeEdgeData >( tree ) ) {
77  throw std::invalid_argument(
78  "Tree is not a MassTree. Cannot calculate its balance data."
79  );
80  }
81  }
82  if( ! identical_topology( trees )) {
83  throw std::invalid_argument(
84  "Trees do not have identical topology. Cannot calculate their balance data."
85  );
86  }
87  if(
88  ! std::isfinite( settings.pseudo_count_summand_all ) ||
89  ! std::isfinite( settings.pseudo_count_summand_zeros ) ||
90  settings.pseudo_count_summand_all < 0.0 ||
91  settings.pseudo_count_summand_zeros < 0.0
92  ) {
93  throw std::invalid_argument(
94  "Pseudo-count summands in balance settings have to be non-negative numbers."
95  );
96  }
97 
98  // Make a nice copy of the tree.
100 
101  // First, get the raw (total) masses. We make the relative later on,
102  // because first we might need the raw ones for the tendency and norm terms of the taxon weights.
103  result.edge_masses = mass_tree_mass_per_edge( trees );
104  assert( result.edge_masses.rows() == trees.size() );
105  assert( result.edge_masses.cols() == result.tree.edge_count() );
106 
107  // Check that we do not use normalized masses here.
108  // For numerical reasons, we use a threshold of 1.1 here, to be sure.
109  // This means that metagenomic samples with less than 1.1 sequences cannot be processed.
110  // We can live with that.
111  for( size_t r = 0; r < result.edge_masses.rows(); ++r ) {
112  auto const sum = std::accumulate(
113  result.edge_masses.row(r).begin(), result.edge_masses.row(r).end(), 0.0
114  );
115  if( sum < 1.1 ) {
116  throw std::runtime_error(
117  "Cannot calculate balance data on trees with normalized masses."
118  );
119  }
120  }
121 
122  // Calculate the central tendency of each taxon (column) of the edge masses.
123  // If we do not use the tendency term, we default to 1.0.
124  // If we only have one tree, we cannot calculate any weights, so we can also skip this.
125  auto tendencies = std::vector<double>( result.tree.edge_count(), 1.0 );
126  if( trees.size() > 1 && settings.tendency != BalanceSettings::WeightTendency::kNone ) {
127 
128  #pragma omp parallel for
129  for( size_t c = 0; c < tendencies.size(); ++c ) {
130  switch( settings.tendency ) {
132  // Can't happen, as we exluded this alreay above.
133  assert( false );
134  break;
135  }
137  tendencies[c] = utils::median(
138  result.edge_masses.col(c).begin(), result.edge_masses.col(c).end()
139  );
140  break;
141  }
143  tendencies[c] = utils::arithmetic_mean(
144  result.edge_masses.col(c).begin(), result.edge_masses.col(c).end()
145  );
146  break;
147  }
149  // For the geometric mean of the raw counts, we add one to avoid zeros.
150  // This is in accordance with Silverman et al.
151  auto raw = result.edge_masses.col(c).to_vector();
152  for( auto& e : raw ) {
153  e += 1.0;
154  }
155  tendencies[c] = utils::geometric_mean( raw );
156  break;
157  }
158  default: {
159  assert( false );
160  }
161  }
162  assert( std::isfinite( tendencies[c] ));
163  assert( tendencies[c] >= 0.0 );
164  }
165  }
166 
167  // Caluclate the norm of the relative abundances across all trees. In Silverman et al.,
168  // it is not explicitly stated whether they calculate the norm on relative abundances with or
169  // without pseudo counts. Probably they do not use pseudo counts. So, we do the same here,
170  // which requires to make a copy of the masses...
171  // If we do not use the norm term, we default to 1.0.
172  auto norms = std::vector<double>( result.tree.edge_count(), 1.0 );
173  if( trees.size() > 1 && settings.norm != BalanceSettings::WeightNorm::kNone ) {
174 
175  // Make a copy of the masses and calcualte relative abundances per row
176  // without any pseudo counts.
177  auto edge_masses_cpy = result.edge_masses;
178  assert( edge_masses_cpy.cols() == result.tree.edge_count() );
179  assert( edge_masses_cpy.cols() == norms.size() );
180  for( size_t r = 0; r < edge_masses_cpy.rows(); ++r ) {
181  utils::closure( edge_masses_cpy.row(r).begin(), edge_masses_cpy.row(r).end() );
182  }
183 
184  // Calculate the norm on these masses.
185  #pragma omp parallel for
186  for( size_t c = 0; c < norms.size(); ++c ) {
187 
188  // Get iterators, so that we avoid copying the vectors.
189  auto em_beg = edge_masses_cpy.col(c).begin();
190  auto em_end = edge_masses_cpy.col(c).end();
191 
192  switch( settings.norm ) {
194  // Can't happen, as we exluded this alreay above.
195  assert( false );
196  break;
197  }
199  norms[c] = utils::manhattan_norm( em_beg, em_end );
200  break;
201  }
203  norms[c] = utils::euclidean_norm( em_beg, em_end );
204  break;
205  }
207  norms[c] = utils::maximum_norm( em_beg, em_end );
208  break;
209  }
211  norms[c] = utils::aitchison_norm( em_beg, em_end );
212  break;
213  }
214  default: {
215  assert( false );
216  }
217  }
218  assert( std::isfinite( norms[c] ));
219  assert( norms[c] >= 0.0 );
220  }
221  }
222 
223  // Calculate taxon weights as the product of tendency and norm per edge.
224  // We only do that for more than one tree, otherwise we leave the weights empty.
225  if( trees.size() > 1 ) {
226  result.taxon_weights = std::vector<double>( result.tree.edge_count() );
227  assert( result.taxon_weights.size() == tendencies.size() );
228  assert( result.taxon_weights.size() == norms.size() );
229  assert( result.taxon_weights.size() == result.tree.edge_count() );
230 
231  for( size_t c = 0; c < result.taxon_weights.size(); ++c ) {
232  assert( std::isfinite( tendencies[c] ) && tendencies[c] >= 0.0 );
233  assert( std::isfinite( norms[c] ) && norms[c] >= 0.0 );
234 
235  result.taxon_weights[c] = tendencies[c] * norms[c];
236  assert( std::isfinite( result.taxon_weights[c] ) && result.taxon_weights[c] >= 0.0 );
237  }
238  }
239 
240  // Now, we can add compensation of zero values in form of pseudo counts.
241  bool found_zero_mass = false;
242  for( auto& em : result.edge_masses ) {
243  assert( std::isfinite( em ) && em >= 0.0 );
244  if( em == 0.0 ) {
245  found_zero_mass = true;
246  em += settings.pseudo_count_summand_zeros;
247  }
248  em += settings.pseudo_count_summand_all;
249  }
250  if(
251  found_zero_mass &&
252  settings.pseudo_count_summand_zeros == 0.0 &&
253  settings.pseudo_count_summand_all == 0.0
254  ) {
255  throw std::runtime_error(
256  "The balance data contains edge masses with value 0, but no pseudo counts were used. "
257  "As such 0 values cannot be used in the balances calculations, you must add pseudo "
258  "counts to them, e.g., via the BalanceSettings."
259  );
260  }
261 
262  // So far, we have only added pseudo counts to the edge masses, but no other transformations.
263  // This is what we want for the raw masses data. Store it.
264  // Then, calculate the closure (relative abundances) per row (tree) of the masses.
265  result.raw_edge_masses = result.edge_masses;
266  for( size_t r = 0; r < result.edge_masses.rows(); ++r ) {
267  utils::closure( result.edge_masses.row(r).begin(), result.edge_masses.row(r).end() );
268  }
269 
270  // If needed, take taxon weights into account for the masses.
271  if( ! result.taxon_weights.empty() ) {
272  assert( trees.size() > 1 );
273  assert( result.taxon_weights.size() == result.edge_masses.cols() );
274 
275  // Get the minimum, which we use as a dummy for taxon weights of zero.
276  auto const em_min = utils::finite_minimum(
277  result.edge_masses.begin(), result.edge_masses.end()
278  );
279 
280  #pragma omp parallel for
281  for( size_t r = 0; r < result.edge_masses.rows(); ++r ) {
282  for( size_t c = 0; c < result.taxon_weights.size(); ++c ) {
283  auto& edge_mass = result.edge_masses( r, c );
284  auto const& taxon_weight = result.taxon_weights[c];
285 
286  // We already made sure that this holds, in the part where pseudo counts are added.
287  assert( std::isfinite( edge_mass ) && ( edge_mass > 0.0 ));
288 
289  // We also already made this sure.
290  assert( std::isfinite( taxon_weight ) && taxon_weight >= 0.0 );
291 
292  // The weights are divided instead of multiplied, which is counter-intuitive, but
293  // correct according to pers. comm. with Justin Silverman, who referred to
294  // > J. J. Egozcue and V. Pawlowsky-Glahn,
295  // > "Changing the Reference Measure in the Simplex and its Weighting Effects,"
296  // > Austrian J. Stat., vol. 45, no. 4, p. 25, Jul. 2016.
297  // for details.
298  // We also need a special case for taxon weights of 0. Those cannot happen in the
299  // original code, because they base their masses on OTU counts of the data,
300  // so there is at least one sample that has a mass > 0 for the taxon.
301  // However, as we use a fixed reference tree, it might happen that there are
302  // branches/taxa where there is not a single placement in any of the samples.
303  // In that case, the taxon weight is 0, and we instead use the pseudo counts as
304  // replacement masses in order to avoid numerical issues with infinities etc.
305  // Basically, it doesn't matter what (finite) value we set the mass to in that case,
306  // as it will be completely ignored because of the 0 weight later on in the
307  // geometric mean calculation anyway... but still, better to set it to something
308  // reasonable!
309  if( taxon_weight == 0.0 ) {
310  // auto const& ps_zeros = settings.pseudo_count_summand_zeros;
311  // auto const& ps_all = settings.pseudo_count_summand_all;
312 
313  // The pseudo counts cannot be both 0 if we are here. We know that we are at a
314  // taxon with no masses at all, so the above found_zero_mass check was triggered.
315  // If then above both pseudo counts were 0, we'd have had an exception by now.
316  // assert(( ps_zeros > 0.0 ) || ( ps_all > 0.0 ));
317  // edge_mass = ps_zeros + ps_all;
318 
319  // Use a dummy weight.
320  edge_mass = em_min;
321  } else {
322  edge_mass /= taxon_weight;
323  }
324  assert( std::isfinite( edge_mass ) && ( edge_mass > 0.0 ));
325  }
326  }
327  }
328 
329  // Assert the result sizes again, just to have that stated explicitly somewhere.
330  assert( result.edge_masses.rows() == trees.size() );
331  assert( result.edge_masses.cols() == result.tree.edge_count() );
332  assert(( trees.size() == 1 ) xor ( result.taxon_weights.size() == result.tree.edge_count() ));
333 
334  return result;
335 }
336 
338  MassTree const& tree,
339  BalanceSettings settings
340 ) {
341  // Ugly, but we need to make a copy to avoid a lot of code duplication.
342  // At least, it's just a single tree.
343  auto const result = mass_balance_data( std::vector<MassTree>{ tree }, settings );
344 
345  assert( identical_topology( result.tree, tree ));
346  assert( result.edge_masses.rows() == 1 );
347  assert( result.edge_masses.cols() == tree.edge_count() );
348  assert( result.taxon_weights.empty() );
349 
350  return result;
351 }
352 
353 // =================================================================================================
354 // Balance Calculation
355 // =================================================================================================
356 
357 std::vector<double> mass_balance(
358  BalanceData const& data,
359  std::unordered_set<size_t> const& numerator_edge_indices,
360  std::unordered_set<size_t> const& denominator_edge_indices
361 ) {
362  auto result = std::vector<double>( data.edge_masses.rows(), 0.0 );
363 
364  #pragma omp parallel for
365  for( size_t r = 0; r < data.edge_masses.rows(); ++r ) {
366  result[r] = mass_balance( data, numerator_edge_indices, denominator_edge_indices, r );
367  }
368 
369  return result;
370 }
371 
373  BalanceData const& data,
374  std::unordered_set<size_t> const& numerator_edge_indices,
375  std::unordered_set<size_t> const& denominator_edge_indices,
376  size_t tree_index
377 ) {
378  // Consistency and input checks.
379  if( data.tree.empty() ) {
380  throw std::runtime_error(
381  "Invalid BalanceData: Cannot calculate balance of an empty tree."
382  );
383  }
384  if( ! data.taxon_weights.empty() && data.taxon_weights.size() != data.edge_masses.cols() ) {
385  throw std::runtime_error(
386  "Invalid BalanceData: Taxon weights need to have same size as edge masses."
387  );
388  }
389  if( data.edge_masses.cols() != data.tree.edge_count() ) {
390  throw std::runtime_error(
391  "Invalid BalanceData: Edge masses need to have same size as the Tree has edges."
392  );
393  }
394  if( numerator_edge_indices.empty() || denominator_edge_indices.empty() ) {
395  throw std::invalid_argument( "Cannot calculate mass balance of empty edge sets." );
396  }
397  if( tree_index >= data.edge_masses.rows() ) {
398  throw std::invalid_argument(
399  "Cannot calculate mass balance with tree index greater than number of trees."
400  );
401  }
402 
403  // Helper struct for more expressive code.
404  struct BalanceTerms
405  {
406  double mean;
407  double scaling;
408  };
409 
410  // Helper function that calculates the geom mean of the `edge_masses` of the given edge indices,
411  // and the scaling term n used for normalization of the balance.
412  // This would be a perfect fit for range filters. But we don't have them in CPP11,
413  // so we need to make copies instead...
414  auto calc_mass_mean_and_scaling_ = [ &data, &tree_index ](
415  std::unordered_set<size_t> const& indices
416  ){
417  // Prepare temporary vectors that store copies of the masses and weights.
418  std::vector<double> sub_masses;
419  std::vector<double> sub_weights;
420  sub_masses.reserve( indices.size() );
421  sub_weights.reserve( indices.size() );
422 
423  // Add the mases and weights to the vectors.
424  for( auto idx : indices ) {
425 
426  // Collect masses at the edge indices.
427  if( idx >= data.edge_masses.cols() ) {
428  throw std::runtime_error( "Invalid edge index in mass balance calculation." );
429  }
430  assert( tree_index < data.edge_masses.rows() );
431  assert( std::isfinite( data.edge_masses( tree_index, idx )));
432  assert( data.edge_masses( tree_index, idx ) >= 0.0 );
433  sub_masses.push_back( data.edge_masses( tree_index, idx ));
434 
435  // Collect weights at the edge indices. If we do not have weights, use 1.0 instead.
436  if( data.taxon_weights.empty() ) {
437  sub_weights.push_back( 1.0 );
438  } else {
439  assert( idx < data.taxon_weights.size() );
440  assert( std::isfinite( data.taxon_weights[idx] ));
441  sub_weights.push_back( data.taxon_weights[idx] );
442  }
443  }
444  assert( sub_masses.size() == indices.size() );
445  assert( sub_weights.size() == indices.size() );
446 
447  // Calculate the mean and the n for scaling.
448  // Check if all weights are zero, which can happen for subtrees with no placements at all.
449  // Set them to nan, so that subsequent calculations simply ignore these values.
450  double scaling_n = std::accumulate( sub_weights.begin(), sub_weights.end(), 0.0 );
451  assert( std::isfinite( scaling_n ));
452  double geom_mean;
453  if( scaling_n > 0.0 ) {
454  geom_mean = utils::weighted_geometric_mean( sub_masses, sub_weights );
455  assert( std::isfinite( geom_mean ));
456  } else {
457  geom_mean = std::numeric_limits<double>::quiet_NaN();
458  }
459 
460  // If we have no edge weights, the scaling terms should be the same as the number of edges.
461  assert(
462  ! data.taxon_weights.empty() ||
463  utils::almost_equal_relative( scaling_n, indices.size(), 0.1 )
464  );
465 
466  return BalanceTerms{ geom_mean, scaling_n };
467  };
468 
469  // Get geometric means of edge subset masses, and the weighted scaling terms.
470  auto const num = calc_mass_mean_and_scaling_( numerator_edge_indices );
471  auto const den = calc_mass_mean_and_scaling_( denominator_edge_indices );
472  assert( std::isnan( num.mean ) || ( num.mean > 0.0 ));
473  assert( std::isnan( den.mean ) || ( den.mean > 0.0 ));
474 
475  // Calculate the balance.
476  double const scaling = std::sqrt(( num.scaling * den.scaling ) / ( num.scaling + den.scaling ));
477  double const balance = scaling * std::log( num.mean / den.mean );
478  assert( std::isnan( balance ) || std::isfinite( balance ));
479 
480  return balance;
481 }
482 
483 } // namespace tree
484 } // namespace genesis
double geometric_mean(ForwardIterator first, ForwardIterator last)
Calculate the geometric mean of a range of positive numbers.
Definition: statistics.hpp:563
double euclidean_norm(ForwardIterator first, ForwardIterator last)
Calculate the Euclidean norm (L2 norm) of a range of numbers.
Definition: distance.hpp:154
Use the Aitchison norm of the relative abundances of the taxon.
double finite_minimum(ForwardIterator first, ForwardIterator last)
Return the minimum of a range of double values.
Definition: statistics.hpp:175
Use the geometric mean of the taxon masses.
Use the Euclidean norm of the relative abundances of the taxon.
Use the Maximum norm of the relative abundances of the taxon.
double manhattan_norm(ForwardIterator first, ForwardIterator last)
Calculate the Manhattan norm (L1 norm) of a range of numbers.
Definition: distance.hpp:131
WeightTendency tendency
Set the term for asssing the central tendency of taxon masses for calculating the taxon weights...
Definition: balances.hpp:140
WeightNorm norm
Set the term for the norm of relative abundances for calculating the taxon weights.
Definition: balances.hpp:145
double aitchison_norm(ForwardIterator first, ForwardIterator last)
Calculate the Aitchison norm of a range of positive numbers.
Definition: distance.hpp:216
BalanceData mass_balance_data(std::vector< MassTree > const &trees, BalanceSettings settings)
Calculate the data needed to calculate balances on MassTrees.
Definition: balances.cpp:64
Tree operator functions.
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
utils::Matrix< double > raw_edge_masses
The absolute raw per-edge masses per original input Tree.
Definition: balances.hpp:213
CommonTree functions.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
double median(const Histogram &h)
double sum(const Histogram &h)
double pseudo_count_summand_zeros
Set the constant that is added to taxon masses that are zero, to avoid zero counts.
Definition: balances.hpp:155
Tree average_branch_length_tree(std::vector< Tree > const &tset)
Return a Tree where the branch lengths are the average of the Trees in the given vector of Trees or T...
Data needed to calculate balances.
Definition: balances.hpp:170
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
Use no norm, that is, use 1.0 for this term.
Use no tendency, that is, use 1.0 for this term.
void closure(ForwardIterator first, ForwardIterator last)
Calculate the closure of a range of numbers.
Definition: statistics.hpp:288
std::vector< double > mass_tree_mass_per_edge(MassTree const &tree)
Return a std::vector that contains the total mass for each edge of the given MassTree.
double weighted_geometric_mean(ForwardIterator first_value, ForwardIterator last_value, ForwardIterator first_weight, ForwardIterator last_weight)
Calculate the weighted geometric mean of a range of positive numbers.
Definition: statistics.hpp:635
double mean(const Histogram &h)
Compute the bin-weighted arithmetic mean.
MatrixCol< self_type, value_type > col(size_t col)
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
Settings to calculate balances and the Phylogenetic ILR Transform.
Definition: balances.hpp:76
MatrixRow< self_type, value_type > row(size_t row)
double maximum_norm(ForwardIterator first, ForwardIterator last)
Calculate the Maximum norm (infinity norm) of a range of numbers.
Definition: distance.hpp:178
double arithmetic_mean(ForwardIterator first, ForwardIterator last)
Calculate the arithmetic mean of a range of numbers.
Definition: statistics.hpp:437
CommonTree convert_to_common_tree(Tree const &source_tree)
Convert a Tree to a CommonTree with CommonNodeData and CommonEdgeData.
double pseudo_count_summand_all
Set the constant that is added to all taxon masses to avoid zero counts.
Definition: balances.hpp:150
std::vector< double > mass_balance(BalanceData const &data, std::unordered_set< size_t > const &numerator_edge_indices, std::unordered_set< size_t > const &denominator_edge_indices)
Calculate the balance of edge masses between two sets of edges.
Definition: balances.cpp:357
Use the Manhattan norm of the relative abundances of the taxon.
bool identical_topology(Tree const &lhs, Tree const &rhs, bool identical_indices)
Return whether both trees have an identical topology.
Use the arithmetic mean of the taxon masses.
Tree tree
The Tree on which to calculate balances.
Definition: balances.hpp:179
std::vector< double > taxon_weights
The taxon/edge weights calculated from mulitple trees.
Definition: balances.hpp:229
Header of Tree class.
utils::Matrix< double > edge_masses
The relative per-edge masses per original input Tree.
Definition: balances.hpp:197
bool almost_equal_relative(double lhs, double rhs, double max_rel_diff=std::numeric_limits< double >::epsilon())
Check whether two doubles are almost equal, using a relative epsilon to compare them.
Definition: common.hpp:118