A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/mass_tree/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cmath>
45 #include <stdexcept>
46 #include <utility>
47 #include <vector>
48 
49 #ifdef GENESIS_OPENMP
50 # include <omp.h>
51 #endif
52 
53 namespace genesis {
54 namespace tree {
55 
56 // =================================================================================================
57 // Manipulate Masses
58 // =================================================================================================
59 
61  MassTree const& lhs,
62  MassTree const& rhs,
63  double const scaler_lhs,
64  double const scaler_rhs
65 ) {
66  auto copy = lhs;
67  mass_tree_merge_trees_inplace( copy, rhs, scaler_lhs, scaler_rhs );
68  return copy;
69 }
70 
72  MassTree& lhs,
73  MassTree const& rhs,
74  double const scaler_lhs,
75  double const scaler_rhs
76 ) {
77  // Do at least a basic check.
78  if( lhs.edge_count() != rhs.edge_count() ) {
79  throw std::runtime_error( "Incompatible MassTrees for merging." );
80  }
81 
82  // Only do the work if needed.
83  if( scaler_lhs != 1.0 ) {
84  mass_tree_scale_masses( lhs, scaler_lhs );
85  }
86 
87  #pragma omp parallel for
88  for( size_t i = 0; i < lhs.edge_count(); ++i ) {
89  auto& lhs_masses = lhs.edge_at( i ).data<MassTreeEdgeData>().masses;
90  for( auto const& rhs_mass : rhs.edge_at( i ).data<MassTreeEdgeData>().masses ) {
91  lhs_masses[ rhs_mass.first ] += scaler_rhs * rhs_mass.second;
92  }
93  }
94 }
95 
97 {
98  for( auto& edge : tree.edges() ) {
99  edge->data<MassTreeEdgeData>().masses.clear();
100  }
101 }
102 
104 {
105  #pragma omp parallel for
106  for( size_t i = 0; i < tree.edge_count(); ++i ) {
107  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
108  for( auto& mass : masses ) {
109  mass.second = -mass.second;
110  }
111  }
112 }
113 
114 void mass_tree_scale_masses( MassTree& tree, double factor )
115 {
116  #pragma omp parallel for
117  for( size_t i = 0; i < tree.edge_count(); ++i ) {
118  auto& masses = tree.edge_at( i ).data<MassTreeEdgeData>().masses;
119  for( auto& mass : masses ) {
120  mass.second *= factor;
121  }
122  }
123 }
124 
126 {
127  double const total_mass = mass_tree_sum_of_masses( tree );
128 
129  if( total_mass == 0.0 ) {
130  return;
131  }
132 
133  for( auto& edge : tree.edges() ) {
134  for( auto& mass : edge->data<MassTreeEdgeData>().masses ) {
135  mass.second /= total_mass;
136  }
137  }
138 }
139 
141 {
142  for( auto& edge : tree.edges() ) {
143  auto& edge_data = edge->data<MassTreeEdgeData>();
144  std::map<double, double> relative;
145 
146  for( auto& mass : edge_data.masses ) {
147  relative[ mass.first / edge_data.branch_length ] += mass.second;
148  }
149 
150  edge_data.masses = relative;
151  edge_data.branch_length = 1.0;
152  }
153 }
154 
156 {
157  double work = 0.0;
158  for( auto& edge : tree.edges() ) {
159  auto& edge_data = edge->data<MassTreeEdgeData>();
160 
161  double const branch_center = edge_data.branch_length / 2;
162  double central_mass = 0.0;
163 
164  for( auto const& mass : edge_data.masses ) {
165  work += mass.second * std::abs( branch_center - mass.first );
166  central_mass += mass.second;
167  }
168 
169  edge_data.masses.clear();
170  edge_data.masses[ branch_center ] = central_mass;
171  }
172  return work;
173 }
174 
176 {
177  double work = 0.0;
178  for( auto& edge : tree.edges() ) {
179  auto& edge_data = edge->data<MassTreeEdgeData>();
180 
181  // No masses on the edge. We need to skip the rest, otherwise we end up having a nan values
182  // as mass centers, which leads to nan earth mover distance values, which leads to invalid
183  // kmeans cluster centroid assigments, which leads to crashes. What a stupid bug that was.
184  if( edge_data.masses.empty() ) {
185  continue;
186  }
187 
188  double mass_center = 0.0;
189  double mass_sum = 0.0;
190 
191  // Accumulate the mass center by adding the weighted positions,
192  // and accumulate the total sum of weights.
193  for( auto const& mass : edge_data.masses ) {
194  mass_center += mass.first * mass.second;
195  mass_sum += mass.second;
196  }
197 
198  // Find average mass center by dividing by weight sum.
199  mass_center /= mass_sum;
200 
201  // Calculate work.
202  for( auto const& mass : edge_data.masses ) {
203  work += mass.second * std::abs( mass_center - mass.first );
204  }
205 
206  // Set the new mass at the mass center.
207  edge_data.masses.clear();
208  edge_data.masses[ mass_center ] = mass_sum;
209  }
210  return work;
211 }
212 
213 double mass_tree_binify_masses( MassTree& tree, size_t number_of_bins )
214 {
215  if( number_of_bins == 0 ) {
216  throw std::invalid_argument( "Cannot use number_of_bins == 0." );
217  }
218 
219  // Take a number and return the closest bin position. The bins are found as the mid points
220  // of equally sized intervals distributed over the branch length, with number_of_bins being
221  // the number of those intervals/bins.
222  auto get_bin_pos = [ number_of_bins ]( double pos, double bl )
223  {
224  auto const nb = static_cast<double>( number_of_bins );
225 
226  // Trim and scale pos to be in interval [ 0.0, nb )
227  auto const pn = std::min( std::max( 0.0, pos / bl * nb ), std::nextafter( nb, 0.0 ));
228 
229  // Floor it to get to the interval start, then scale back, and add half the interval size,
230  // so that we end up at the mid point of the interval.
231  return ( std::floor( pn ) * bl / nb ) + ( bl / nb / 2.0 );
232  };
233 
234  double work = 0.0;
235  for( auto& edge : tree.edges() ) {
236 
237  // Shorthands.
238  auto& edge_data = edge->data<MassTreeEdgeData>();
239  auto new_masses = std::map<double, double>();
240 
241  // Accumulate masses at the closest bins, and accumulate the work needed to do so.
242  for( auto const& mass : edge_data.masses ) {
243  auto const bin = get_bin_pos( mass.first, edge_data.branch_length );
244 
245  work += mass.second * std::abs( bin - mass.first );
246  new_masses[ bin ] += mass.second;
247  }
248 
249  // Replace masses by new accumuated ones.
250  edge_data.masses = new_masses;
251  }
252 
253  return work;
254 }
255 
256 // =================================================================================================
257 // Others
258 // =================================================================================================
259 
260 std::vector<double> mass_tree_mass_per_edge( MassTree const& tree )
261 {
262  auto result = std::vector<double>( tree.edge_count(), 0.0 );
263 
264  for( auto const& edge : tree.edges() ) {
265  auto const& idx = edge->index();
266  for( auto const& mass : edge->data<MassTreeEdgeData>().masses ) {
267  result[ idx ] += mass.second;
268  }
269  }
270 
271  return result;
272 }
273 
274 double mass_tree_sum_of_masses( MassTree const& tree )
275 {
276  double total_mass = 0.0;
277  for( auto const& edge : tree.edges() ) {
278  for( auto const& mass : edge->data<MassTreeEdgeData>().masses ) {
279  total_mass += mass.second;
280  }
281  }
282  return total_mass;
283 }
284 
285 bool mass_tree_validate( MassTree const& tree, double valid_total_mass_difference )
286 {
287  // Check tree.
288  if( ! validate_topology( tree ) ) {
289  LOG_INFO << "Invalid Tree topology.";
290  return false;
291  }
292  if( ! tree_data_is< MassTreeNodeData, MassTreeEdgeData >( tree )) {
293  LOG_INFO << "Tree does not only contain Mass Node and Edge data types.";
294  return false;
295  }
296 
297  // Check masses.
298  double mass_sum = 0.0;
299  for( auto const& edge : tree.edges() ) {
300  auto const edge_data = dynamic_cast< MassTreeEdgeData const* >( edge->data_ptr() );
301  if( edge_data == nullptr ) {
302  LOG_INFO << "Edge data type is not 'MassTreeEdgeData'.";
303  return false;
304  }
305 
306  for( auto const& mass : edge_data->masses ) {
307  if( mass.first < 0.0 ) {
308  LOG_INFO << "Mass with branch position < 0.0";
309  return false;
310  }
311  if( mass.first > edge_data->branch_length ) {
312  LOG_INFO << "Mass with branch position > branch_length";
313  return false;
314  }
315 
316  mass_sum += mass.second;
317  }
318  }
319 
320  if( valid_total_mass_difference >= 0.0 && std::abs(mass_sum) > valid_total_mass_difference ) {
321  LOG_INFO << "Total mass difference " << mass_sum
322  << " is higher than " << valid_total_mass_difference;
323  return false;
324  }
325  return true;
326 }
327 
328 } // namespace tree
329 } // namespace genesis
void mass_tree_merge_trees_inplace(MassTree &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees by adding them to the first MassTree.
void mass_tree_reverse_signs(MassTree &tree)
Reverse the sign of each mass point on a MassTree.
double mass_tree_sum_of_masses(MassTree const &tree)
Return the total sum of all masses on the MassTree.
Tree operator functions.
Data class for MassTreeEdges. Stores the branch length and a list of masses with their positions alon...
double mass_tree_center_masses_on_branches_averaged(MassTree &tree)
Accumulate all masses of a MassTree at the average mass position per edge.
std::vector< double > mass_tree_mass_per_edge(MassTree const &tree)
Return a std::vector that contains the total Mass for each edge of the given MassTree.
std::map< double, double > masses
List of masses stored on this branch, sorted by their position on the branch.
void mass_tree_transform_to_unit_branch_lengths(MassTree &tree)
Set all branch lengths of a MassTree to 1.0, while keeping the relative position of all masses on the...
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
bool validate_topology(Tree const &tree)
Validate that all internal pointers of the Tree elements (TreeLinks, TreeNodes, TreeEdges) to each ot...
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Provides easy and fast logging functionality.
void mass_tree_clear_masses(MassTree &tree)
Clear all masses of a MassTree, while keeping its topology.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
double branch_length
Branch length of the edge.
void mass_tree_scale_masses(MassTree &tree, double factor)
Scale all masses of a MassTree with the multiplicative factor factor.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
Matrix operators.
Header of Tree class.
double mass_tree_center_masses_on_branches(MassTree &tree)
Accumulate all masses of a MassTree on the centers of their edges.
EdgeDataType & data()
Definition: edge.hpp:118
double mass_tree_binify_masses(MassTree &tree, size_t number_of_bins)
Accumulate all masses of a MassTree into bins on each branch.
bool mass_tree_validate(MassTree const &tree, double valid_total_mass_difference)
Validate the data on a MassTree.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98