A toolkit for working with phylogenetic data.
v0.24.0
tree/mass_tree/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cmath>
45 #include <stdexcept>
46 #include <utility>
47 #include <vector>
48 
49 #ifdef GENESIS_OPENMP
50 # include <omp.h>
51 #endif
52 
53 namespace genesis {
54 namespace tree {
55 
56 // =================================================================================================
57 // Manipulate Masses
58 // =================================================================================================
59 
61  MassTree const& lhs,
62  MassTree const& rhs,
63  double const scaler_lhs,
64  double const scaler_rhs
65 ) {
66  auto copy = lhs;
67  mass_tree_merge_trees_inplace( copy, rhs, scaler_lhs, scaler_rhs );
68  return copy;
69 }
70 
72  MassTree& lhs,
73  MassTree const& rhs,
74  double const scaler_lhs,
75  double const scaler_rhs
76 ) {
77  // Do at least a basic check.
78  if( lhs.edge_count() != rhs.edge_count() ) {
79  throw std::runtime_error( "Incompatible MassTrees for merging." );
80  }
81 
82  // Only do the work if needed.
83  if( scaler_lhs != 1.0 ) {
84  mass_tree_scale_masses( lhs, scaler_lhs );
85  }
86 
87  #pragma omp parallel for
88  for( size_t i = 0; i < lhs.edge_count(); ++i ) {
89  auto& lhs_masses = lhs.edge_at( i ).data<MassTreeEdgeData>().masses;
90  for( auto const& rhs_mass : rhs.edge_at( i ).data<MassTreeEdgeData>().masses ) {
91  lhs_masses[ rhs_mass.first ] += scaler_rhs * rhs_mass.second;
92  }
93  }
94 }
95 
97 {
98  #pragma omp parallel for
99  for( size_t i = 0; i < tree.edge_count(); ++i ) {
100  tree.edge_at(i).data<MassTreeEdgeData>().masses.clear();
101  }
102 }
103 
105 {
106  #pragma omp parallel for
107  for( size_t i = 0; i < tree.edge_count(); ++i ) {
108  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
109  for( auto& mass : masses ) {
110  mass.second = -mass.second;
111  }
112  }
113 }
114 
115 void mass_tree_scale_masses( MassTree& tree, double factor )
116 {
117  #pragma omp parallel for
118  for( size_t i = 0; i < tree.edge_count(); ++i ) {
119  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
120  for( auto& mass : masses ) {
121  mass.second *= factor;
122  }
123  }
124 }
125 
127 {
128  double const total_mass = mass_tree_sum_of_masses( tree );
129 
130  if( total_mass == 0.0 ) {
131  return;
132  }
133 
134  #pragma omp parallel for
135  for( size_t i = 0; i < tree.edge_count(); ++i ) {
136  for( auto& mass : tree.edge_at(i).data<MassTreeEdgeData>().masses ) {
137  mass.second /= total_mass;
138  }
139  }
140 }
141 
143 {
144  #pragma omp parallel for
145  for( size_t i = 0; i < tree.edge_count(); ++i ) {
146  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
147  std::map<double, double> relative;
148 
149  for( auto& mass : edge_data.masses ) {
150  relative[ mass.first / edge_data.branch_length ] += mass.second;
151  }
152 
153  edge_data.masses = relative;
154  edge_data.branch_length = 1.0;
155  }
156 }
157 
159 {
160  double work = 0.0;
161 
162  #pragma omp parallel for
163  for( size_t i = 0; i < tree.edge_count(); ++i ) {
164  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
165 
166  double const branch_center = edge_data.branch_length / 2;
167  double central_mass = 0.0;
168 
169  for( auto const& mass : edge_data.masses ) {
170  #pragma omp atomic
171  work += mass.second * std::abs( branch_center - mass.first );
172  central_mass += mass.second;
173  }
174 
175  edge_data.masses.clear();
176  edge_data.masses[ branch_center ] = central_mass;
177  }
178  return work;
179 }
180 
182 {
183  double work = 0.0;
184 
185  #pragma omp parallel for
186  for( size_t i = 0; i < tree.edge_count(); ++i ) {
187  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
188 
189  // No masses on the edge. We need to skip the rest, otherwise we end up having a nan values
190  // as mass centers, which leads to nan earth mover distance values, which leads to invalid
191  // kmeans cluster centroid assigments, which leads to crashes. What a stupid bug that was.
192  if( edge_data.masses.empty() ) {
193  continue;
194  }
195 
196  double mass_center = 0.0;
197  double mass_sum = 0.0;
198 
199  // Accumulate the mass center by adding the weighted positions,
200  // and accumulate the total sum of weights.
201  for( auto const& mass : edge_data.masses ) {
202  mass_center += mass.first * mass.second;
203  mass_sum += mass.second;
204  }
205 
206  // Find average mass center by dividing by weight sum.
207  mass_center /= mass_sum;
208 
209  // Calculate work.
210  for( auto const& mass : edge_data.masses ) {
211  #pragma omp atomic
212  work += mass.second * std::abs( mass_center - mass.first );
213  }
214 
215  // Set the new mass at the mass center.
216  edge_data.masses.clear();
217  edge_data.masses[ mass_center ] = mass_sum;
218  }
219  return work;
220 }
221 
222 double mass_tree_binify_masses( MassTree& tree, size_t number_of_bins )
223 {
224  if( number_of_bins == 0 ) {
225  throw std::invalid_argument( "Cannot use number_of_bins == 0." );
226  }
227 
228  // Take a number and return the closest bin position. The bins are found as the mid points
229  // of equally sized intervals distributed over the branch length, with number_of_bins being
230  // the number of those intervals/bins.
231  auto get_bin_pos = [ number_of_bins ]( double pos, double bl )
232  {
233  auto const nb = static_cast<double>( number_of_bins );
234 
235  // Trim and scale pos to be in interval [ 0.0, nb )
236  auto const pn = std::min( std::max( 0.0, pos / bl * nb ), std::nextafter( nb, 0.0 ));
237 
238  // Floor it to get to the interval start, then scale back, and add half the interval size,
239  // so that we end up at the mid point of the interval.
240  return ( std::floor( pn ) * bl / nb ) + ( bl / nb / 2.0 );
241  };
242 
243  double work = 0.0;
244 
245  #pragma omp parallel for
246  for( size_t i = 0; i < tree.edge_count(); ++i ) {
247 
248  // Shorthands.
249  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
250  auto new_masses = std::map<double, double>();
251 
252  // Accumulate masses at the closest bins, and accumulate the work needed to do so.
253  for( auto const& mass : edge_data.masses ) {
254  auto const bin = get_bin_pos( mass.first, edge_data.branch_length );
255 
256  work += mass.second * std::abs( bin - mass.first );
257  new_masses[ bin ] += mass.second;
258  }
259 
260  // Replace masses by new accumuated ones.
261  edge_data.masses = new_masses;
262  }
263 
264  return work;
265 }
266 
267 // =================================================================================================
268 // Others
269 // =================================================================================================
270 
271 void mass_trees_make_average_branch_lengths( std::vector<MassTree>& mass_trees )
272 {
273  // Nothing to do.
274  if( mass_trees.size() < 2 ) {
275  return;
276  }
277 
278  // Store averages
279  size_t const num_edges = mass_trees[0].edge_count();
280  auto avg_br_lens = std::vector<double>( num_edges, 0.0 );
281 
282  // Accumulate averages
283  for( auto const& tree : mass_trees ) {
284 
285  // Check
286  if( tree.edge_count() != num_edges ) {
287  throw std::runtime_error(
288  "Cannot make average branch lengths, because trees have different size."
289  );
290  }
291 
292  // Accu
293  assert( avg_br_lens.size() == tree.edge_count() );
294  for( size_t edge_idx = 0; edge_idx < num_edges; ++edge_idx ) {
295  avg_br_lens[edge_idx] += tree.edge_at( edge_idx ).data<MassTreeEdgeData>().branch_length;
296  }
297  }
298 
299  // Average
300  for( auto& ae : avg_br_lens ) {
301  ae /= static_cast<double>( mass_trees.size() );
302  }
303 
304  // Set branch lengths and ajust masses.
305  for( auto& tree : mass_trees ) {
306 
307  #pragma omp parallel for
308  for( size_t edge_idx = 0; edge_idx < tree.edge_count(); ++edge_idx ) {
309 
310  // Setup.
311  auto& edge_data = tree.edge_at( edge_idx ).data<MassTreeEdgeData>();
312  auto new_masses = std::map<double, double>();
313 
314  // Branch position scaler.
315  auto const scaler = avg_br_lens[ edge_idx ] / edge_data.branch_length;
316 
317  // Move masses proprotional to branch lengths change.
318  for( auto const& mass : edge_data.masses ) {
319 
320  auto const new_pos = mass.first * scaler;
321  new_masses[ new_pos ] += mass.second;
322  }
323 
324  // Replace masses by new accumuated ones, and change br len
325  edge_data.masses = new_masses;
326  edge_data.branch_length = avg_br_lens[ edge_idx ];
327  }
328  }
329 }
330 
331 std::vector<double> mass_tree_mass_per_edge( MassTree const& tree )
332 {
333  auto result = std::vector<double>( tree.edge_count(), 0.0 );
334 
335  #pragma omp parallel for
336  for( size_t i = 0; i < tree.edge_count(); ++i ) {
337  auto const& edge = tree.edge_at(i);
338  assert( i == edge.index() );
339 
340  double sum = 0.0;
341  for( auto const& mass : edge.data<MassTreeEdgeData>().masses ) {
342  sum += mass.second;
343  }
344  result[ i ] = sum;
345  }
346 
347  return result;
348 }
349 
350 utils::Matrix<double> mass_tree_mass_per_edge( std::vector<MassTree> const& mass_trees )
351 {
352  if( mass_trees.empty() || mass_trees[0].empty() ) {
353  return {};
354  }
355 
356  utils::Matrix<double> result{ mass_trees.size(), mass_trees[0].edge_count(), 0.0 };
357 
358  #pragma omp parallel for
359  for( size_t i = 0; i < mass_trees.size(); ++i ) {
360  if( mass_trees[i].edge_count() != result.cols() ) {
361  throw std::runtime_error(
362  "Cannot calculate masses per edge for a Tree set with Trees "
363  "with unequal edge count."
364  );
365  }
366 
367  result.row( i ) = mass_tree_mass_per_edge( mass_trees[i] );
368  }
369 
370  return result;
371 }
372 
373 std::vector<std::pair<double, double>> mass_tree_mass_per_edge_averaged( MassTree const& tree )
374 {
375  // First value: position. Second value: mass at that position.
376  auto result = std::vector<std::pair<double, double>>( tree.edge_count(), { 0.0, 0.0 });
377 
378  #pragma omp parallel for
379  for( size_t i = 0; i < tree.edge_count(); ++i ) {
380  auto const& edge = tree.edge_at(i);
381  auto const& edge_data = edge.data<MassTreeEdgeData>();
382 
383  // No masses on the edge. We need to skip the rest, otherwise we end up having a nan values.
384  if( edge_data.masses.empty() ) {
385  continue;
386  }
387 
388  // Add up masses and positions.
389  double mass_pos = 0.0;
390  double mass_sum = 0.0;
391  for( auto const& mass : edge_data.masses ) {
392  mass_pos += mass.first * mass.second;
393  mass_sum += mass.second;
394  }
395 
396  // Find average mass center by dividing by total mass.
397  mass_pos /= mass_sum;
398 
399  result[ edge.index() ].first = mass_pos;
400  result[ edge.index() ].second = mass_sum;
401  }
402 
403  return result;
404 }
405 
406 double mass_tree_sum_of_masses( MassTree const& tree )
407 {
408  double total_mass = 0.0;
409  for( auto const& edge : tree.edges() ) {
410  for( auto const& mass : edge.data<MassTreeEdgeData>().masses ) {
411  total_mass += mass.second;
412  }
413  }
414  return total_mass;
415 }
416 
417 bool mass_tree_validate( MassTree const& tree, double valid_total_mass_difference )
418 {
419  // Check tree.
420  if( ! validate_topology( tree ) ) {
421  LOG_INFO << "Invalid Tree topology.";
422  return false;
423  }
424  if( ! tree_data_is< MassTreeNodeData, MassTreeEdgeData >( tree )) {
425  LOG_INFO << "Tree does not only contain Mass Node and Edge data types.";
426  return false;
427  }
428 
429  // Check masses.
430  double mass_sum = 0.0;
431  for( auto const& edge : tree.edges() ) {
432  auto const edge_data = dynamic_cast< MassTreeEdgeData const* >( edge.data_ptr() );
433  if( edge_data == nullptr ) {
434  LOG_INFO << "Edge data type is not 'MassTreeEdgeData'.";
435  return false;
436  }
437 
438  for( auto const& mass : edge_data->masses ) {
439  if( mass.first < 0.0 ) {
440  LOG_INFO << "Mass with branch position < 0.0";
441  return false;
442  }
443  if( mass.first > edge_data->branch_length ) {
444  LOG_INFO << "Mass with branch position > branch_length";
445  return false;
446  }
447 
448  mass_sum += mass.second;
449  }
450  }
451 
452  if( valid_total_mass_difference >= 0.0 && std::abs(mass_sum) > valid_total_mass_difference ) {
453  LOG_INFO << "Total mass difference " << mass_sum
454  << " is higher than " << valid_total_mass_difference;
455  return false;
456  }
457  return true;
458 }
459 
460 } // namespace tree
461 } // namespace genesis
void mass_tree_reverse_signs(MassTree &tree)
Reverse the sign of each mass point on a MassTree.
Tree operator functions.
Data class for MassTreeEdges. Stores the branch length and a list of masses with their positions alon...
double mass_tree_center_masses_on_branches_averaged(MassTree &tree)
Accumulate all masses of a MassTree at the average mass position per edge.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
double sum(const Histogram &h)
double mass_tree_sum_of_masses(MassTree const &tree)
Return the total sum of all masses on the MassTree.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
bool mass_tree_validate(MassTree const &tree, double valid_total_mass_difference)
Validate the data on a MassTree.
Matrix operators.
std::vector< double > mass_tree_mass_per_edge(MassTree const &tree)
Return a std::vector that contains the total mass for each edge of the given MassTree.
std::map< double, double > masses
List of masses stored on this branch, sorted by their position on the branch.
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().
void mass_tree_transform_to_unit_branch_lengths(MassTree &tree)
Set all branch lengths of a MassTree to 1.0, while keeping the relative position of all masses on the...
std::vector< std::pair< double, double > > mass_tree_mass_per_edge_averaged(MassTree const &tree)
Return a std::vector that contains the total Mass for each edge of the given MassTree (second value)...
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
void mass_tree_merge_trees_inplace(MassTree &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees by adding them to the first MassTree.
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
void mass_trees_make_average_branch_lengths(std::vector< MassTree > &mass_trees)
Change the branch lengths of all trees to their average, and move the masses accordingly in a proport...
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
Provides easy and fast logging functionality.
void mass_tree_clear_masses(MassTree &tree)
Clear all masses of a MassTree, while keeping its topology.
double branch_length
Branch length of the edge.
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
void mass_tree_scale_masses(MassTree &tree, double factor)
Scale all masses of a MassTree with the multiplicative factor factor.
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Header of Tree class.
double mass_tree_center_masses_on_branches(MassTree &tree)
Accumulate all masses of a MassTree on the centers of their edges.
EdgeDataType & data()
Definition: edge.hpp:217
double mass_tree_binify_masses(MassTree &tree, size_t number_of_bins)
Accumulate all masses of a MassTree into bins on each branch.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:99