A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/mass_tree/kmeans.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <cassert>
40 #include <stdexcept>
41 #include <vector>
42 
43 #ifdef GENESIS_OPENMP
44 # include <omp.h>
45 #endif
46 
47 namespace genesis {
48 namespace tree {
49 
50 // =================================================================================================
51 // Settings
52 // =================================================================================================
53 
55 {
56  return accumulate_centroid_masses_;
57 }
58 
60 {
61  accumulate_centroid_masses_ = value;
62 }
63 
64 // =================================================================================================
65 // Mass Tree Kmeans
66 // =================================================================================================
67 
68 void MassTreeKmeans::pre_loop_hook(
69  std::vector<Point> const& data,
70  std::vector<size_t>& assignments,
71  std::vector<Point>& centroids
72 ) {
73  (void) data;
74  (void) assignments;
75 
76  if( accumulate_centroid_masses_ == 1 ) {
77  for( auto& centroid : centroids ) {
79  }
80  } else if( accumulate_centroid_masses_ > 1 ) {
81  for( auto& centroid : centroids ) {
82  mass_tree_binify_masses( centroid, accumulate_centroid_masses_ );
83  }
84  }
85 }
86 
87 bool MassTreeKmeans::data_validation( std::vector<Point> const& data ) const
88 {
89  // Check if all Trees have the correct data types.
90  for( auto const& tree : data ) {
91  if( ! tree_data_is< MassTreeNodeData, MassTreeEdgeData >( tree ) ) {
92  throw std::invalid_argument( "Trees for Kmeans do not have MassTree data types." );
93  }
94  }
95 
96  // Check if all Trees have the same topology. Important to caluclate EMD.
97  // adapted from all_identical_topology()
98  for (size_t i = 1; i < data.size(); i++) {
99  if( ! identical_topology( data[i-1], data[i] )) {
100  throw std::invalid_argument( "Trees for Kmeans do not have identical topologies." );
101  }
102  }
103 
104  return true;
105 }
106 
107 void MassTreeKmeans::update_centroids(
108  std::vector<Point> const& data,
109  std::vector<size_t> const& assignments,
110  std::vector<Point>& centroids
111 ) {
112  // Shorthand.
113  auto const k = centroids.size();
114 
115  // Clear all centroid masses from previous iteration.
116  #pragma omp parallel for
117  for( size_t c = 0; c < k; ++c ) {
118  mass_tree_clear_masses( centroids[ c ] );
119  }
120 
121  // This function is only called from within the run() function, which already
122  // checks this condition. So, simply assert it here, instead of throwing.
123  assert( data.size() == assignments.size() );
124 
125  #ifdef GENESIS_OPENMP
126 
127  // Parallelize over centroids
128  #pragma omp parallel for
129  for( size_t c = 0; c < k; ++c ) {
130 
131  // Thread-local data.
132  auto& centroid = centroids[ c ];
133  size_t count = 0;
134 
135  // Work through the data and assigments and accumulate...
136  for( size_t d = 0; d < data.size(); ++d ) {
137 
138  // ( Check correct assignments. )
139  assert( assignments[ d ] < k );
140 
141  // ... but only the relevant parts.
142  if( assignments[ d ] != c ) {
143  continue;
144  }
145 
146  // Accumulate centroid.
147  mass_tree_merge_trees_inplace( centroid, data[ d ] );
148  ++count;
149  }
150 
151  // Make sure that the sum of masses is okay. This is a bit wibbly wobbly because
152  // of the double equality check, but we have to live with it.
154  count, mass_tree_sum_of_masses( centroid ), 0.00001
155  ));
156 
157  // Put in bins for speedup, and normalize.
158  if( accumulate_centroid_masses_ == 1 ) {
160  } else if( accumulate_centroid_masses_ > 1 ) {
161  mass_tree_binify_masses( centroid, accumulate_centroid_masses_ );
162  }
163  mass_tree_normalize_masses( centroid );
164  }
165 
166  #else
167 
168  // In the serial case, we only want to traverse the data once.
169 
170  // Count how many mass trees are accumulated per centroid.
171  auto counts = std::vector<size_t>( k, 0 );
172 
173  // Work through the data and assigments and accumulate.
174  for( size_t d = 0; d < data.size(); ++d ) {
175 
176  // Check correct assignments.
177  assert( assignments[ d ] < k );
178 
179  // Accumulate centroid.
180  mass_tree_merge_trees_inplace( centroids[ assignments[ d ] ], data[ d ] );
181  ++counts[ assignments[ d ] ];
182  }
183 
184  // Normalize the centroids.
185  for( size_t c = 0; c < k; ++c ) {
186 
187  // Make sure that the sum of masses is okay. This is a bit wibbly wobbly because
188  // of the double equality check, but we have to live with it.
190  counts[ c ], mass_tree_sum_of_masses( centroids[ c ] ), 0.00001
191  ));
192 
193  // Put in bins for speedup, and normalize.
194  if( accumulate_centroid_masses_ == 1 ) {
196  } else if( accumulate_centroid_masses_ > 1 ) {
197  mass_tree_binify_masses( centroids[ c ], accumulate_centroid_masses_ );
198  }
199  mass_tree_normalize_masses( centroids[ c ] );
200  }
201 
202  #endif
203 }
204 
205 double MassTreeKmeans::distance( Point const& lhs, Point const& rhs ) const
206 {
207  return earth_movers_distance( lhs, rhs );
208 }
209 
210 } // namespace tree
211 } // namespace genesis
void mass_tree_merge_trees_inplace(MassTree &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees by adding them to the first MassTree.
double mass_tree_sum_of_masses(MassTree const &tree)
Return the total sum of all masses on the MassTree.
Tree operator functions.
double mass_tree_center_masses_on_branches_averaged(MassTree &tree)
Accumulate all masses of a MassTree at the average mass position per edge.
void mass_tree_clear_masses(MassTree &tree)
Clear all masses of a MassTree, while keeping its topology.
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
bool identical_topology(Tree const &lhs, Tree const &rhs)
Returns true iff both trees have an identical topology.
double mass_tree_binify_masses(MassTree &tree, size_t number_of_bins)
Accumulate all masses of a MassTree into bins on each branch.
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover's distance of two distributions of masses on a given Tree.
bool almost_equal_relative(double lhs, double rhs, double max_rel_diff=std::numeric_limits< double >::epsilon())
Check whether two doubles are almost equal, using a relative epsilon to compare them.
Definition: common.hpp:108