A library for working with phylogenetic and population genetic data.
v0.32.0
tree/mass_tree/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2023 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cmath>
45 #include <stdexcept>
46 #include <utility>
47 #include <vector>
48 
49 #ifdef GENESIS_OPENMP
50 # include <omp.h>
51 #endif
52 
53 namespace genesis {
54 namespace tree {
55 
56 // =================================================================================================
57 // Manipulate Masses
58 // =================================================================================================
59 
61  MassTree const& lhs,
62  MassTree const& rhs,
63  double const scaler_lhs,
64  double const scaler_rhs
65 ) {
66  auto copy = lhs;
67  mass_tree_merge_trees_inplace( copy, rhs, scaler_lhs, scaler_rhs );
68  return copy;
69 }
70 
72  MassTree& lhs,
73  MassTree const& rhs,
74  double const scaler_lhs,
75  double const scaler_rhs
76 ) {
77  // Do at least a basic check.
78  if( lhs.edge_count() != rhs.edge_count() ) {
79  throw std::runtime_error( "Incompatible MassTrees for merging." );
80  }
81 
82  // Only do the work if needed.
83  if( scaler_lhs != 1.0 ) {
84  mass_tree_scale_masses( lhs, scaler_lhs );
85  }
86 
87  #pragma omp parallel for
88  for( size_t i = 0; i < lhs.edge_count(); ++i ) {
89  auto& lhs_masses = lhs.edge_at( i ).data<MassTreeEdgeData>().masses;
90  for( auto const& rhs_mass : rhs.edge_at( i ).data<MassTreeEdgeData>().masses ) {
91  lhs_masses[ rhs_mass.first ] += scaler_rhs * rhs_mass.second;
92  }
93  }
94 }
95 
97 {
98  #pragma omp parallel for
99  for( size_t i = 0; i < tree.edge_count(); ++i ) {
100  tree.edge_at(i).data<MassTreeEdgeData>().masses.clear();
101  }
102 }
103 
105 {
106  #pragma omp parallel for
107  for( size_t i = 0; i < tree.edge_count(); ++i ) {
108  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
109  for( auto& mass : masses ) {
110  mass.second = -mass.second;
111  }
112  }
113 }
114 
115 void mass_tree_scale_masses( MassTree& tree, double factor )
116 {
117  #pragma omp parallel for
118  for( size_t i = 0; i < tree.edge_count(); ++i ) {
119  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
120  for( auto& mass : masses ) {
121  mass.second *= factor;
122  }
123  }
124 }
125 
127 {
128  double const total_mass = mass_tree_sum_of_masses( tree );
129 
130  if( total_mass == 0.0 ) {
131  return;
132  }
133 
134  #pragma omp parallel for
135  for( size_t i = 0; i < tree.edge_count(); ++i ) {
136  for( auto& mass : tree.edge_at(i).data<MassTreeEdgeData>().masses ) {
137  mass.second /= total_mass;
138  }
139  }
140 }
141 
143 {
144  #pragma omp parallel for
145  for( size_t i = 0; i < tree.edge_count(); ++i ) {
146  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
147  std::map<double, double> relative;
148 
149  for( auto& mass : edge_data.masses ) {
150  relative[ mass.first / edge_data.branch_length ] += mass.second;
151  }
152 
153  edge_data.masses = relative;
154  edge_data.branch_length = 1.0;
155  }
156 }
157 
159 {
160  double work = 0.0;
161 
162  #pragma omp parallel for
163  for( size_t i = 0; i < tree.edge_count(); ++i ) {
164  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
165 
166  double const branch_center = edge_data.branch_length / 2;
167  double central_mass = 0.0;
168 
169  for( auto const& mass : edge_data.masses ) {
170  #pragma omp atomic
171  work += mass.second * std::abs( branch_center - mass.first );
172  central_mass += mass.second;
173  }
174 
175  edge_data.masses.clear();
176  edge_data.masses[ branch_center ] = central_mass;
177  }
178  return work;
179 }
180 
182 {
183  double work = 0.0;
184 
185  #pragma omp parallel for
186  for( size_t i = 0; i < tree.edge_count(); ++i ) {
187  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
188 
189  // No masses on the edge. We need to skip the rest, otherwise we end up having a nan values
190  // as mass centers, which leads to nan earth mover distance values, which leads to invalid
191  // kmeans cluster centroid assignments, which leads to crashes. What a stupid bug that was.
192  if( edge_data.masses.empty() ) {
193  continue;
194  }
195 
196  double mass_center = 0.0;
197  double mass_sum = 0.0;
198 
199  // Accumulate the mass center by adding the weighted positions,
200  // and accumulate the total sum of weights.
201  for( auto const& mass : edge_data.masses ) {
202  mass_center += mass.first * mass.second;
203  mass_sum += mass.second;
204  }
205 
206  // Find average mass center by dividing by weight sum.
207  mass_center /= mass_sum;
208 
209  // Calculate work.
210  for( auto const& mass : edge_data.masses ) {
211  #pragma omp atomic
212  work += mass.second * std::abs( mass_center - mass.first );
213  }
214 
215  // Set the new mass at the mass center.
216  edge_data.masses.clear();
217  edge_data.masses[ mass_center ] = mass_sum;
218  }
219  return work;
220 }
221 
222 double mass_tree_binify_masses( MassTree& tree, size_t number_of_bins )
223 {
224  if( number_of_bins == 0 ) {
225  throw std::invalid_argument( "Cannot use number_of_bins == 0." );
226  }
227 
228  // Take a number and return the closest bin position. The bins are found as the mid points
229  // of equally sized intervals distributed over the branch length, with number_of_bins being
230  // the number of those intervals/bins.
231  auto get_bin_pos = [ number_of_bins ]( double pos, double bl )
232  {
233  auto const nb = static_cast<double>( number_of_bins );
234 
235  // Trim and scale pos to be in interval [ 0.0, nb )
236  auto const pn = std::min( std::max( 0.0, pos / bl * nb ), std::nextafter( nb, 0.0 ));
237 
238  // Floor it to get to the interval start, then scale back, and add half the interval size,
239  // so that we end up at the mid point of the interval.
240  return ( std::floor( pn ) * bl / nb ) + ( bl / nb / 2.0 );
241  };
242 
243  double work = 0.0;
244 
245  #pragma omp parallel for
246  for( size_t i = 0; i < tree.edge_count(); ++i ) {
247 
248  // Shorthands.
249  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
250  auto new_masses = std::map<double, double>();
251 
252  // Accumulate masses at the closest bins, and accumulate the work needed to do so.
253  for( auto const& mass : edge_data.masses ) {
254  auto const bin = get_bin_pos( mass.first, edge_data.branch_length );
255 
256  work += mass.second * std::abs( bin - mass.first );
257  new_masses[ bin ] += mass.second;
258  }
259 
260  // Replace masses by new accumuated ones.
261  edge_data.masses = new_masses;
262  }
263 
264  return work;
265 }
266 
267 // =================================================================================================
268 // Others
269 // =================================================================================================
270 
271 void mass_trees_make_average_branch_lengths( std::vector<MassTree>& mass_trees )
272 {
273  // Nothing to do.
274  if( mass_trees.size() < 2 ) {
275  return;
276  }
277 
278  // Store averages
279  size_t const num_edges = mass_trees[0].edge_count();
280  auto avg_br_lens = std::vector<double>( num_edges, 0.0 );
281 
282  // Accumulate averages
283  for( auto const& tree : mass_trees ) {
284 
285  // Check
286  if( tree.edge_count() != num_edges ) {
287  throw std::runtime_error(
288  "Cannot make average branch lengths, because trees have different size."
289  );
290  }
291 
292  // Accu
293  assert( avg_br_lens.size() == tree.edge_count() );
294  for( size_t edge_idx = 0; edge_idx < num_edges; ++edge_idx ) {
295  avg_br_lens[edge_idx] += tree.edge_at( edge_idx ).data<MassTreeEdgeData>().branch_length;
296  }
297  }
298 
299  // Average
300  for( auto& ae : avg_br_lens ) {
301  ae /= static_cast<double>( mass_trees.size() );
302  }
303 
304  // Set branch lengths and ajust masses.
305  for( auto& tree : mass_trees ) {
306 
307  #pragma omp parallel for
308  for( size_t edge_idx = 0; edge_idx < tree.edge_count(); ++edge_idx ) {
309 
310  // Setup.
311  auto& edge_data = tree.edge_at( edge_idx ).data<MassTreeEdgeData>();
312  auto new_masses = std::map<double, double>();
313 
314  // Branch position scaler. Need to take care of zero branch lengths.
315  assert( std::isfinite( avg_br_lens[ edge_idx ] ));
316  assert( std::isfinite( edge_data.branch_length ));
317  auto scaler = avg_br_lens[ edge_idx ] / edge_data.branch_length;
318  if( ! std::isfinite( scaler )) {
319  assert( edge_data.branch_length == 0.0 );
320  scaler = 1.0;
321  }
322 
323  // Move masses proprotional to branch lengths change.
324  for( auto const& mass : edge_data.masses ) {
325 
326  auto const new_pos = mass.first * scaler;
327  new_masses[ new_pos ] += mass.second;
328  }
329 
330  // Replace masses by new accumuated ones, and change br len
331  edge_data.masses = new_masses;
332  edge_data.branch_length = avg_br_lens[ edge_idx ];
333  }
334  }
335 }
336 
337 std::vector<double> mass_tree_mass_per_edge( MassTree const& tree )
338 {
339  auto result = std::vector<double>( tree.edge_count(), 0.0 );
340 
341  #pragma omp parallel for
342  for( size_t i = 0; i < tree.edge_count(); ++i ) {
343  auto const& edge = tree.edge_at(i);
344  assert( i == edge.index() );
345 
346  double sum = 0.0;
347  for( auto const& mass : edge.data<MassTreeEdgeData>().masses ) {
348  sum += mass.second;
349  }
350  result[ i ] = sum;
351  }
352 
353  return result;
354 }
355 
356 utils::Matrix<double> mass_tree_mass_per_edge( std::vector<MassTree> const& mass_trees )
357 {
358  if( mass_trees.empty() || mass_trees[0].empty() ) {
359  return {};
360  }
361 
362  utils::Matrix<double> result{ mass_trees.size(), mass_trees[0].edge_count(), 0.0 };
363 
364  #pragma omp parallel for
365  for( size_t i = 0; i < mass_trees.size(); ++i ) {
366  if( mass_trees[i].edge_count() != result.cols() ) {
367  throw std::runtime_error(
368  "Cannot calculate masses per edge for a Tree set with Trees "
369  "with unequal edge count."
370  );
371  }
372 
373  result.row( i ) = mass_tree_mass_per_edge( mass_trees[i] );
374  }
375 
376  return result;
377 }
378 
379 std::vector<std::pair<double, double>> mass_tree_mass_per_edge_averaged( MassTree const& tree )
380 {
381  // First value: position. Second value: mass at that position.
382  auto result = std::vector<std::pair<double, double>>( tree.edge_count(), { 0.0, 0.0 });
383 
384  #pragma omp parallel for
385  for( size_t i = 0; i < tree.edge_count(); ++i ) {
386  auto const& edge = tree.edge_at(i);
387  auto const& edge_data = edge.data<MassTreeEdgeData>();
388 
389  // No masses on the edge. We need to skip the rest, otherwise we end up having a nan values.
390  if( edge_data.masses.empty() ) {
391  continue;
392  }
393 
394  // Add up masses and positions.
395  double mass_pos = 0.0;
396  double mass_sum = 0.0;
397  for( auto const& mass : edge_data.masses ) {
398  mass_pos += mass.first * mass.second;
399  mass_sum += mass.second;
400  }
401 
402  // Find average mass center by dividing by total mass.
403  mass_pos /= mass_sum;
404 
405  result[ edge.index() ].first = mass_pos;
406  result[ edge.index() ].second = mass_sum;
407  }
408 
409  return result;
410 }
411 
412 double mass_tree_sum_of_masses( MassTree const& tree )
413 {
414  double total_mass = 0.0;
415  for( auto const& edge : tree.edges() ) {
416  for( auto const& mass : edge.data<MassTreeEdgeData>().masses ) {
417  total_mass += mass.second;
418  }
419  }
420  return total_mass;
421 }
422 
423 bool mass_tree_validate( MassTree const& tree, double valid_total_mass_difference )
424 {
425  // Check tree.
426  if( ! validate_topology( tree ) ) {
427  LOG_INFO << "Invalid Tree topology.";
428  return false;
429  }
430  if( ! tree_data_is< MassTreeNodeData, MassTreeEdgeData >( tree )) {
431  LOG_INFO << "Tree does not only contain Mass Node and Edge data types.";
432  return false;
433  }
434 
435  // Check masses.
436  double mass_sum = 0.0;
437  for( auto const& edge : tree.edges() ) {
438  auto const edge_data = dynamic_cast< MassTreeEdgeData const* >( edge.data_ptr() );
439  if( edge_data == nullptr ) {
440  LOG_INFO << "Edge data type is not 'MassTreeEdgeData'.";
441  return false;
442  }
443 
444  for( auto const& mass : edge_data->masses ) {
445  if( mass.first < 0.0 ) {
446  LOG_INFO << "Mass with branch position < 0.0";
447  return false;
448  }
449  if( mass.first > edge_data->branch_length ) {
450  LOG_INFO << "Mass with branch position > branch_length";
451  return false;
452  }
453 
454  mass_sum += mass.second;
455  }
456  }
457 
458  if( valid_total_mass_difference >= 0.0 && std::abs(mass_sum) > valid_total_mass_difference ) {
459  LOG_INFO << "Total mass difference " << mass_sum
460  << " is higher than " << valid_total_mass_difference;
461  return false;
462  }
463  return true;
464 }
465 
466 } // namespace tree
467 } // namespace genesis
LOG_INFO
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:100
genesis::tree::MassTreeEdgeData::masses
std::map< double, double > masses
List of masses stored on this branch, sorted by their position on the branch.
Definition: tree/mass_tree/tree.hpp:199
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::tree::mass_tree_merge_trees
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Definition: tree/mass_tree/functions.cpp:60
genesis::tree::mass_tree_center_masses_on_branches_averaged
double mass_tree_center_masses_on_branches_averaged(MassTree &tree)
Accumulate all masses of a MassTree at the average mass position per edge.
Definition: tree/mass_tree/functions.cpp:181
genesis::tree::MassTreeEdgeData
Data class for MassTreeEdges. Stores the branch length and a list of masses with their positions alon...
Definition: tree/mass_tree/tree.hpp:148
genesis::tree::mass_tree_validate
bool mass_tree_validate(MassTree const &tree, double valid_total_mass_difference)
Validate the data on a MassTree.
Definition: tree/mass_tree/functions.cpp:423
genesis::tree::mass_tree_transform_to_unit_branch_lengths
void mass_tree_transform_to_unit_branch_lengths(MassTree &tree)
Set all branch lengths of a MassTree to 1.0, while keeping the relative position of all masses on the...
Definition: tree/mass_tree/functions.cpp:142
tree.hpp
Header of Tree class.
genesis::tree::Tree::edge_at
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
genesis::tree::mass_tree_reverse_signs
void mass_tree_reverse_signs(MassTree &tree)
Reverse the sign of each mass point on a MassTree.
Definition: tree/mass_tree/functions.cpp:104
genesis::tree::mass_tree_binify_masses
double mass_tree_binify_masses(MassTree &tree, size_t number_of_bins)
Accumulate all masses of a MassTree into bins on each branch.
Definition: tree/mass_tree/functions.cpp:222
genesis::utils::Matrix< double >
genesis::tree::mass_tree_scale_masses
void mass_tree_scale_masses(MassTree &tree, double factor)
Scale all masses of a MassTree with the multiplicative factor factor.
Definition: tree/mass_tree/functions.cpp:115
tree.hpp
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
logging.hpp
Provides easy and fast logging functionality.
genesis::tree::CommonEdgeData::branch_length
double branch_length
Branch length of the edge.
Definition: tree/common_tree/tree.hpp:193
matrix.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
operators.hpp
Tree operator functions.
genesis::tree::mass_tree_merge_trees_inplace
void mass_tree_merge_trees_inplace(MassTree &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees by adding them to the first MassTree.
Definition: tree/mass_tree/functions.cpp:71
genesis::tree::Tree::edges
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
functions.hpp
genesis::tree::mass_tree_center_masses_on_branches
double mass_tree_center_masses_on_branches(MassTree &tree)
Accumulate all masses of a MassTree on the centers of their edges.
Definition: tree/mass_tree/functions.cpp:158
operators.hpp
Matrix operators.
genesis::tree::mass_tree_mass_per_edge
std::vector< double > mass_tree_mass_per_edge(MassTree const &tree)
Return a std::vector that contains the total mass for each edge of the given MassTree.
Definition: tree/mass_tree/functions.cpp:337
genesis::tree::mass_tree_sum_of_masses
double mass_tree_sum_of_masses(MassTree const &tree)
Return the total sum of all masses on the MassTree.
Definition: tree/mass_tree/functions.cpp:412
genesis::tree::mass_tree_normalize_masses
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
Definition: tree/mass_tree/functions.cpp:126
postorder.hpp
genesis::tree::TreeEdge::data
EdgeDataType & data()
Definition: edge.hpp:217
genesis::tree::mass_trees_make_average_branch_lengths
void mass_trees_make_average_branch_lengths(std::vector< MassTree > &mass_trees)
Change the branch lengths of all trees to their average, and move the masses accordingly in a proport...
Definition: tree/mass_tree/functions.cpp:271
genesis::tree::validate_topology
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
Definition: tree/function/operators.cpp:332
genesis::tree::edge_count
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
Definition: tree/function/functions.cpp:212
genesis::tree::mass_tree_mass_per_edge_averaged
std::vector< std::pair< double, double > > mass_tree_mass_per_edge_averaged(MassTree const &tree)
Return a std::vector that contains the total Mass for each edge of the given MassTree (second value),...
Definition: tree/mass_tree/functions.cpp:379
genesis::tree::mass_tree_clear_masses
void mass_tree_clear_masses(MassTree &tree)
Clear all masses of a MassTree, while keeping its topology.
Definition: tree/mass_tree/functions.cpp:96
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272