A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/mass_tree/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cmath>
45 #include <stdexcept>
46 #include <utility>
47 #include <vector>
48 
49 #ifdef GENESIS_OPENMP
50 # include <omp.h>
51 #endif
52 
53 namespace genesis {
54 namespace tree {
55 
56 // =================================================================================================
57 // Manipulate Masses
58 // =================================================================================================
59 
61  MassTree const& lhs,
62  MassTree const& rhs,
63  double const scaler_lhs,
64  double const scaler_rhs
65 ) {
66  auto copy = lhs;
67  mass_tree_merge_trees_inplace( copy, rhs, scaler_lhs, scaler_rhs );
68  return copy;
69 }
70 
72  MassTree& lhs,
73  MassTree const& rhs,
74  double const scaler_lhs,
75  double const scaler_rhs
76 ) {
77  // Do at least a basic check.
78  if( lhs.edge_count() != rhs.edge_count() ) {
79  throw std::runtime_error( "Incompatible MassTrees for merging." );
80  }
81 
82  // Only do the work if needed.
83  if( scaler_lhs != 1.0 ) {
84  mass_tree_scale_masses( lhs, scaler_lhs );
85  }
86 
87  #pragma omp parallel for
88  for( size_t i = 0; i < lhs.edge_count(); ++i ) {
89  auto& lhs_masses = lhs.edge_at( i ).data<MassTreeEdgeData>().masses;
90  for( auto const& rhs_mass : rhs.edge_at( i ).data<MassTreeEdgeData>().masses ) {
91  lhs_masses[ rhs_mass.first ] += scaler_rhs * rhs_mass.second;
92  }
93  }
94 }
95 
97 {
98  #pragma omp parallel for
99  for( size_t i = 0; i < tree.edge_count(); ++i ) {
100  tree.edge_at(i).data<MassTreeEdgeData>().masses.clear();
101  }
102 }
103 
105 {
106  #pragma omp parallel for
107  for( size_t i = 0; i < tree.edge_count(); ++i ) {
108  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
109  for( auto& mass : masses ) {
110  mass.second = -mass.second;
111  }
112  }
113 }
114 
115 void mass_tree_scale_masses( MassTree& tree, double factor )
116 {
117  #pragma omp parallel for
118  for( size_t i = 0; i < tree.edge_count(); ++i ) {
119  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
120  for( auto& mass : masses ) {
121  mass.second *= factor;
122  }
123  }
124 }
125 
127 {
128  double const total_mass = mass_tree_sum_of_masses( tree );
129 
130  if( total_mass == 0.0 ) {
131  return;
132  }
133 
134  #pragma omp parallel for
135  for( size_t i = 0; i < tree.edge_count(); ++i ) {
136  for( auto& mass : tree.edge_at(i).data<MassTreeEdgeData>().masses ) {
137  mass.second /= total_mass;
138  }
139  }
140 }
141 
143 {
144  #pragma omp parallel for
145  for( size_t i = 0; i < tree.edge_count(); ++i ) {
146  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
147  std::map<double, double> relative;
148 
149  for( auto& mass : edge_data.masses ) {
150  relative[ mass.first / edge_data.branch_length ] += mass.second;
151  }
152 
153  edge_data.masses = relative;
154  edge_data.branch_length = 1.0;
155  }
156 }
157 
159 {
160  double work = 0.0;
161 
162  #pragma omp parallel for
163  for( size_t i = 0; i < tree.edge_count(); ++i ) {
164  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
165 
166  double const branch_center = edge_data.branch_length / 2;
167  double central_mass = 0.0;
168 
169  for( auto const& mass : edge_data.masses ) {
170  #pragma omp atomic
171  work += mass.second * std::abs( branch_center - mass.first );
172  central_mass += mass.second;
173  }
174 
175  edge_data.masses.clear();
176  edge_data.masses[ branch_center ] = central_mass;
177  }
178  return work;
179 }
180 
182 {
183  double work = 0.0;
184 
185  #pragma omp parallel for
186  for( size_t i = 0; i < tree.edge_count(); ++i ) {
187  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
188 
189  // No masses on the edge. We need to skip the rest, otherwise we end up having a nan values
190  // as mass centers, which leads to nan earth mover distance values, which leads to invalid
191  // kmeans cluster centroid assigments, which leads to crashes. What a stupid bug that was.
192  if( edge_data.masses.empty() ) {
193  continue;
194  }
195 
196  double mass_center = 0.0;
197  double mass_sum = 0.0;
198 
199  // Accumulate the mass center by adding the weighted positions,
200  // and accumulate the total sum of weights.
201  for( auto const& mass : edge_data.masses ) {
202  mass_center += mass.first * mass.second;
203  mass_sum += mass.second;
204  }
205 
206  // Find average mass center by dividing by weight sum.
207  mass_center /= mass_sum;
208 
209  // Calculate work.
210  for( auto const& mass : edge_data.masses ) {
211  #pragma omp atomic
212  work += mass.second * std::abs( mass_center - mass.first );
213  }
214 
215  // Set the new mass at the mass center.
216  edge_data.masses.clear();
217  edge_data.masses[ mass_center ] = mass_sum;
218  }
219  return work;
220 }
221 
222 double mass_tree_binify_masses( MassTree& tree, size_t number_of_bins )
223 {
224  if( number_of_bins == 0 ) {
225  throw std::invalid_argument( "Cannot use number_of_bins == 0." );
226  }
227 
228  // Take a number and return the closest bin position. The bins are found as the mid points
229  // of equally sized intervals distributed over the branch length, with number_of_bins being
230  // the number of those intervals/bins.
231  auto get_bin_pos = [ number_of_bins ]( double pos, double bl )
232  {
233  auto const nb = static_cast<double>( number_of_bins );
234 
235  // Trim and scale pos to be in interval [ 0.0, nb )
236  auto const pn = std::min( std::max( 0.0, pos / bl * nb ), std::nextafter( nb, 0.0 ));
237 
238  // Floor it to get to the interval start, then scale back, and add half the interval size,
239  // so that we end up at the mid point of the interval.
240  return ( std::floor( pn ) * bl / nb ) + ( bl / nb / 2.0 );
241  };
242 
243  double work = 0.0;
244 
245  #pragma omp parallel for
246  for( size_t i = 0; i < tree.edge_count(); ++i ) {
247 
248  // Shorthands.
249  auto& edge_data = tree.edge_at(i).data<MassTreeEdgeData>();
250  auto new_masses = std::map<double, double>();
251 
252  // Accumulate masses at the closest bins, and accumulate the work needed to do so.
253  for( auto const& mass : edge_data.masses ) {
254  auto const bin = get_bin_pos( mass.first, edge_data.branch_length );
255 
256  work += mass.second * std::abs( bin - mass.first );
257  new_masses[ bin ] += mass.second;
258  }
259 
260  // Replace masses by new accumuated ones.
261  edge_data.masses = new_masses;
262  }
263 
264  return work;
265 }
266 
267 // =================================================================================================
268 // Others
269 // =================================================================================================
270 
271 bool mass_tree_all_identical_topology( std::vector<MassTree> const& mass_trees )
272 {
273  // If all pairs of two adjacent trees have same the topology, all of them have.
274  // Thus, we do not need a complete pairwise comparision.
275  for (size_t i = 1; i < mass_trees.size(); i++) {
276  if( ! identical_topology( mass_trees[i-1], mass_trees[i] )) {
277  return false;
278  }
279  }
280  return true;
281 }
282 
283 void mass_trees_make_average_branch_lengths( std::vector<MassTree>& mass_trees )
284 {
285  // Nothing to do.
286  if( mass_trees.size() < 2 ) {
287  return;
288  }
289 
290  // Store averages
291  size_t const num_edges = mass_trees[0].edge_count();
292  auto avg_br_lens = std::vector<double>( num_edges, 0.0 );
293 
294  // Accumulate averages
295  for( auto const& tree : mass_trees ) {
296 
297  // Check
298  if( tree.edge_count() != num_edges ) {
299  throw std::runtime_error(
300  "Cannot make average branch lengths, because trees have different size."
301  );
302  }
303 
304  // Accu
305  assert( avg_br_lens.size() == tree.edge_count() );
306  for( size_t edge_idx = 0; edge_idx < num_edges; ++edge_idx ) {
307  avg_br_lens[edge_idx] += tree.edge_at( edge_idx ).data<MassTreeEdgeData>().branch_length;
308  }
309  }
310 
311  // Average
312  for( auto& ae : avg_br_lens ) {
313  ae /= static_cast<double>( mass_trees.size() );
314  }
315 
316  // Set branch lengths and ajust masses.
317  for( auto& tree : mass_trees ) {
318 
319  #pragma omp parallel for
320  for( size_t edge_idx = 0; edge_idx < tree.edge_count(); ++edge_idx ) {
321 
322  // Setup.
323  auto& edge_data = tree.edge_at( edge_idx ).data<MassTreeEdgeData>();
324  auto new_masses = std::map<double, double>();
325 
326  // Branch position scaler.
327  auto const scaler = avg_br_lens[ edge_idx ] / edge_data.branch_length;
328 
329  // Move masses proprotional to branch lengths change.
330  for( auto const& mass : edge_data.masses ) {
331 
332  auto const new_pos = mass.first * scaler;
333  new_masses[ new_pos ] += mass.second;
334  }
335 
336  // Replace masses by new accumuated ones, and change br len
337  edge_data.masses = new_masses;
338  edge_data.branch_length = avg_br_lens[ edge_idx ];
339  }
340  }
341 }
342 
343 std::vector<double> mass_tree_mass_per_edge( MassTree const& tree )
344 {
345  auto result = std::vector<double>( tree.edge_count(), 0.0 );
346 
347  #pragma omp parallel for
348  for( size_t i = 0; i < tree.edge_count(); ++i ) {
349  auto const& edge = tree.edge_at(i);
350  auto const& idx = edge.index();
351  for( auto const& mass : edge.data<MassTreeEdgeData>().masses ) {
352  result[ idx ] += mass.second;
353  }
354  }
355 
356  return result;
357 }
358 
359 double mass_tree_sum_of_masses( MassTree const& tree )
360 {
361  double total_mass = 0.0;
362  for( auto const& edge : tree.edges() ) {
363  for( auto const& mass : edge->data<MassTreeEdgeData>().masses ) {
364  total_mass += mass.second;
365  }
366  }
367  return total_mass;
368 }
369 
370 bool mass_tree_validate( MassTree const& tree, double valid_total_mass_difference )
371 {
372  // Check tree.
373  if( ! validate_topology( tree ) ) {
374  LOG_INFO << "Invalid Tree topology.";
375  return false;
376  }
377  if( ! tree_data_is< MassTreeNodeData, MassTreeEdgeData >( tree )) {
378  LOG_INFO << "Tree does not only contain Mass Node and Edge data types.";
379  return false;
380  }
381 
382  // Check masses.
383  double mass_sum = 0.0;
384  for( auto const& edge : tree.edges() ) {
385  auto const edge_data = dynamic_cast< MassTreeEdgeData const* >( edge->data_ptr() );
386  if( edge_data == nullptr ) {
387  LOG_INFO << "Edge data type is not 'MassTreeEdgeData'.";
388  return false;
389  }
390 
391  for( auto const& mass : edge_data->masses ) {
392  if( mass.first < 0.0 ) {
393  LOG_INFO << "Mass with branch position < 0.0";
394  return false;
395  }
396  if( mass.first > edge_data->branch_length ) {
397  LOG_INFO << "Mass with branch position > branch_length";
398  return false;
399  }
400 
401  mass_sum += mass.second;
402  }
403  }
404 
405  if( valid_total_mass_difference >= 0.0 && std::abs(mass_sum) > valid_total_mass_difference ) {
406  LOG_INFO << "Total mass difference " << mass_sum
407  << " is higher than " << valid_total_mass_difference;
408  return false;
409  }
410  return true;
411 }
412 
413 } // namespace tree
414 } // namespace genesis
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:105
void mass_tree_merge_trees_inplace(MassTree &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees by adding them to the first MassTree.
void mass_tree_reverse_signs(MassTree &tree)
Reverse the sign of each mass point on a MassTree.
double mass_tree_sum_of_masses(MassTree const &tree)
Return the total sum of all masses on the MassTree.
Tree operator functions.
Data class for MassTreeEdges. Stores the branch length and a list of masses with their positions alon...
double mass_tree_center_masses_on_branches_averaged(MassTree &tree)
Accumulate all masses of a MassTree at the average mass position per edge.
std::vector< double > mass_tree_mass_per_edge(MassTree const &tree)
Return a std::vector that contains the total Mass for each edge of the given MassTree.
Matrix operators.
std::map< double, double > masses
List of masses stored on this branch, sorted by their position on the branch.
void mass_tree_transform_to_unit_branch_lengths(MassTree &tree)
Set all branch lengths of a MassTree to 1.0, while keeping the relative position of all masses on the...
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
void mass_trees_make_average_branch_lengths(std::vector< MassTree > &mass_trees)
Change the branch lengths of all trees to their average, and move the masses accordingly in a proport...
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Provides easy and fast logging functionality.
void mass_tree_clear_masses(MassTree &tree)
Clear all masses of a MassTree, while keeping its topology.
bool mass_tree_all_identical_topology(std::vector< MassTree > const &mass_trees)
Return true iff all Trees in the vector have an identical topology.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
double branch_length
Branch length of the edge.
void mass_tree_scale_masses(MassTree &tree, double factor)
Scale all masses of a MassTree with the multiplicative factor factor.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
Header of Tree class.
double mass_tree_center_masses_on_branches(MassTree &tree)
Accumulate all masses of a MassTree on the centers of their edges.
bool identical_topology(Tree const &lhs, Tree const &rhs)
Returns true iff both trees have an identical topology.
EdgeDataType & data()
Definition: edge.hpp:183
double mass_tree_binify_masses(MassTree &tree, size_t number_of_bins)
Accumulate all masses of a MassTree into bins on each branch.
bool mass_tree_validate(MassTree const &tree, double valid_total_mass_difference)
Validate the data on a MassTree.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98