A library for working with phylogenetic and population genetic data.
v0.32.0
squash_clustering.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cmath>
41 #include <stdexcept>
42 #include <utility>
43 #include <vector>
44 #include <limits>
45 
46 #ifdef GENESIS_OPENMP
47 # include <omp.h>
48 #endif
49 
50 namespace genesis {
51 namespace tree {
52 
53 // =================================================================================================
54 // Squash Clustering
55 // =================================================================================================
56 
57 void SquashClustering::init_( std::vector<MassTree>&& trees )
58 {
59  // Clear. Both its clusters and mergeres are empty.
60  clear();
61 
62  // Move all trees as single data points to the cluster list, and make them active.
63  clusters_.resize( trees.size() );
64  for( size_t i = 0; i < trees.size(); ++i ) {
65  clusters_[i].tree = std::move( trees[i] );
66  clusters_[i].count = 1;
67  clusters_[i].active = true;
68  }
69 
70  // Fill the "lower triangle" of distances, i.e., all distances to elements with lower indices
71  // than the current one. We don't store this in a global distance matix, but in a vector
72  // for each cluster instead, as this makes it trivial to keep track of the data when merging
73  // clusters. No need to keep track of which row belongs to which cluster etc.
74  // We do this in a second loop, so that all trees haven been moved and OpenMP can access them.
75  for( size_t i = 0; i < clusters_.size(); ++i ) {
76 
77  // The cluster need i many distance entries, i.e., cluster 0 needs 0 entries,
78  // cluster 1 needs 1 entry (to compare it to cluster 0), and so forth.
79  clusters_[i].distances.resize( i );
80 
81  // Calculate the distances.
82  #pragma omp parallel for
83  for( size_t k = 0; k < i; ++k ) {
84  auto const dist = earth_movers_distance( clusters_[i].tree, clusters_[k].tree, p_ );
85  clusters_[i].distances[k] = dist;
86  }
87 
88  // Also, write out the trees for user output if needed.
89  if( write_cluster_tree ) {
90  write_cluster_tree( clusters_[i].tree, i );
91  }
92  }
93 }
94 
95 std::pair<size_t, size_t> SquashClustering::min_entry_() const
96 {
97  // Init.
98  size_t min_i = 0;
99  size_t min_j = 0;
100  double min_d = std::numeric_limits<double>::max();
101 
102  // Find min cell.
103  #pragma omp parallel for schedule(dynamic)
104  for( size_t i = 0; i < clusters_.size(); ++i ) {
105  if( ! clusters_[i].active ) {
106  continue;
107  }
108 
109  // We only need to check the "lower triangle".
110  assert( clusters_[i].distances.size() == i );
111  for( size_t j = 0; j < i; ++j ) {
112  if( ! clusters_[j].active ) {
113  continue;
114  }
115 
116  // Comparison.
117  #pragma omp flush( min_d )
118  if( clusters_[i].distances[j] < min_d ) {
119  #pragma omp critical( GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_MIN_ENTRY_UPDATE )
120  {
121  if( clusters_[i].distances[j] < min_d ) {
122  min_i = i;
123  min_j = j;
124  min_d = clusters_[i].distances[j];
125  }
126  }
127  }
128  }
129  }
130 
131  // We return reverse order, so that i < j. This is just more intuitive to work with.
132  assert( min_i > min_j );
133  return { min_j, min_i };
134 }
135 
136 void SquashClustering::merge_clusters_( size_t i, size_t j )
137 {
138  assert( i < j );
139  assert( i < clusters_.size() && j < clusters_.size() );
140  assert( i < clusters_[j].distances.size() );
141 
142  // Make new cluster.
143  clusters_.emplace_back();
144  auto& new_cluster = clusters_.back();
145 
146  // Make a new cluster tree as the weighted average of both given trees.
147  auto const weight_i = static_cast<double>( clusters_[i].count );
148  auto const weight_j = static_cast<double>( clusters_[j].count );
149  new_cluster.tree = mass_tree_merge_trees(
150  clusters_[i].tree, clusters_[j].tree, weight_i, weight_j
151  );
152  mass_tree_normalize_masses( new_cluster.tree );
153 
154  // If the user wants to write intermediate trees, to so now,
155  // so that we later can free the memory again.
156  if( write_cluster_tree ) {
157  assert( clusters_.size() > 0 );
158  write_cluster_tree( new_cluster.tree, clusters_.size() - 1 );
159  }
160 
161  // Old way: Scale masses, merge them, scale back.
162  // Scale both trees according to their weight (= count).
163  // mass_tree_scale_masses( clusters_[i].tree, static_cast<double>( clusters_[i].count ));
164  // mass_tree_scale_masses( clusters_[j].tree, static_cast<double>( clusters_[j].count ));
165  // new_cluster.tree = mass_tree_merge_trees( clusters_[i].tree, clusters_[j].tree );
166  //
167  // mass_tree_normalize_masses( new_cluster.tree );
168  // mass_tree_scale_masses( clusters_[i].tree, 1.0 / static_cast<double>( clusters_[i].count ));
169  // mass_tree_scale_masses( clusters_[j].tree, 1.0 / static_cast<double>( clusters_[j].count ));
170 
171  // Set other properties of the new cluster.
172  new_cluster.count = clusters_[i].count + clusters_[j].count;
173  new_cluster.active = true;
174  new_cluster.distances.resize( clusters_.size() - 1, 0.0 );
175 
176  // Calculate distances to still active clusters, which also includes the two clusters that
177  // we are about to merge. We will deactivate them after the loop. This way, we also compute
178  // their distances in parallel, maximizing OpenMP throughput!
179  #pragma omp parallel for schedule(dynamic)
180  for( size_t k = 0; k < clusters_.size() - 1; ++k ) {
181  if( ! clusters_[k].active ) {
182  continue;
183  }
184 
185  auto const dist = earth_movers_distance( new_cluster.tree, clusters_[k].tree, p_ );
186  new_cluster.distances[k] = dist;
187  }
188 
189  // Get the distance between the two clusters that we want to merge,
190  // and make a new cluster merger.
191  mergers_.push_back({ i, new_cluster.distances[i], j, new_cluster.distances[j] });
192 
193  // Test. How much does the avg dist differ from proper emd dist?
194  // We need to access the distance in reverse j i order, because of the lower triangle.
195  // auto const inner_dist = clusters_[j].distances[i];
196  // auto const avg_dist_i = inner_dist / static_cast<double>( clusters_[i].count );
197  // auto const avg_dist_j = inner_dist / static_cast<double>( clusters_[j].count );
198 
199  // Deactive. Those two clusters are now merged.
200  clusters_[i].active = false;
201  clusters_[j].active = false;
202 
203  // We don't need the distances any more. Save mem.
204  clusters_[i].distances.clear();
205  clusters_[j].distances.clear();
206 
207  // We can also destroy those trees. They haven been written before, and are not needed any more.
208  clusters_[i].tree.clear();
209  clusters_[j].tree.clear();
210 }
211 
212 void SquashClustering::run( std::vector<MassTree>&& trees )
213 {
214  // Number of trees we are going to cluster.
215  auto const tree_count = trees.size();
216 
217  // Init the result object.
218  // LOG_INFO << "Squash Clustering: Initialize";
219  if( report_initialization ) {
221  }
222  init_( std::move( trees ) );
223 
224  // Do a full clustering, until only one is left.
225  for( size_t i = 0; i < tree_count - 1; ++i ) {
226  // LOG_INFO << "Squash Clustering: Step " << (i+1) << " of " << tree_count - 1;
227  if( report_step ) {
228  report_step( i + 1, tree_count - 1 );
229  }
230 
231  auto const min_pair = min_entry_();
232  assert( min_pair.first < min_pair.second );
233 
234  merge_clusters_( min_pair.first, min_pair.second );
235  }
236 
237  // At the end, we only have one big cluster node.
238  assert( 1 == [&](){
239  size_t count_active = 0;
240  for( auto const& c : clusters_ ) {
241  if( c.active ) {
242  ++count_active;
243  }
244  }
245  return count_active;
246  }() );
247 
248  // Furthermore, make sure we have created the right number of mergers and clusters.
249  assert( tree_count + mergers_.size() == clusters_.size() );
250 }
251 
252 std::string SquashClustering::tree_string( std::vector<std::string> const& labels ) const
253 {
254  if( labels.size() != clusters_.size() - mergers_.size() ) {
255  throw std::runtime_error(
256  "List of labels does not have the correct size for the number of squash cluster elements."
257  );
258  }
259 
260  auto list = labels;
261  for( size_t i = 0; i < mergers_.size(); ++i ) {
262  auto const& cm = mergers_[i];
263 
264  auto node_a = list[ cm.index_a ] + ":" + std::to_string( cm.distance_a );
265  auto node_b = list[ cm.index_b ] + ":" + std::to_string( cm.distance_b );
266 
267  list[ cm.index_a ].clear();
268  list[ cm.index_b ].clear();
269 
270  list.push_back( "(" + node_a + "," + node_b + ")" + std::to_string( i + labels.size() ) );
271  }
272 
273  return list.back() + ";";
274 }
275 
277 {
278  // p_ = 1.0;
279  clusters_.clear();
280  mergers_.clear();
281 }
282 
283 } // namespace tree
284 } // namespace genesis
genesis::tree::SquashClustering::clear
void clear()
Clear the clusters() and mergers() data.
Definition: squash_clustering.cpp:276
emd.hpp
genesis::tree::SquashClustering::write_cluster_tree
std::function< void(MassTree const &cluster_tree, size_t index)> write_cluster_tree
Definition: squash_clustering.hpp:191
genesis::tree::mass_tree_merge_trees
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Definition: tree/mass_tree/functions.cpp:60
squash_clustering.hpp
genesis::tree::SquashClustering::report_step
std::function< void(size_t i, size_t total)> report_step
Definition: squash_clustering.hpp:189
genesis::tree::SquashClustering::tree_string
std::string tree_string(std::vector< std::string > const &labels) const
Build a Newick-format tree for visualizing the result of a squash_clustering().
Definition: squash_clustering.cpp:252
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::tree::SquashClustering::run
void run(std::vector< MassTree > &&trees)
Perfom Squash Clustering.
Definition: squash_clustering.cpp:212
logging.hpp
Provides easy and fast logging functionality.
genesis::sequence::labels
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:64
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::earth_movers_distance
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover's distance of two distributions of masses on a given Tree.
Definition: tree/mass_tree/emd.cpp:60
functions.hpp
genesis::tree::SquashClustering::report_initialization
std::function< void(void)> report_initialization
Definition: squash_clustering.hpp:188
genesis::tree::mass_tree_normalize_masses
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
Definition: tree/mass_tree/functions.cpp:126