A toolkit for working with phylogenetic data.
v0.24.0
squash_clustering.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cmath>
41 #include <stdexcept>
42 #include <utility>
43 #include <vector>
44 
45 #ifdef GENESIS_OPENMP
46 # include <omp.h>
47 #endif
48 
49 namespace genesis {
50 namespace tree {
51 
52 // =================================================================================================
53 // Squash Clustering
54 // =================================================================================================
55 
56 void SquashClustering::init_( std::vector<MassTree>&& trees )
57 {
58  // Clear. Both its clusters and mergeres are empty.
59  clear();
60 
61  // Move all trees as single data points to the cluster list, and make them active.
62  clusters_.resize( trees.size() );
63  for( size_t i = 0; i < trees.size(); ++i ) {
64  clusters_[i].tree = std::move( trees[i] );
65  clusters_[i].count = 1;
66  clusters_[i].active = true;
67  }
68 
69  // Fill the "lower triangle" of distances, i.e., all distances to elements with lower indices
70  // than the current one. We don't store this in a global distance matix, but in a vector
71  // for each cluster instead, as this makes it trivial to keep track of the data when merging
72  // clusters. No need to keep track of which row belongs to which cluster etc.
73  // We do this in a second loop, so that all trees haven been moved and OpenMP can access them.
74  for( size_t i = 0; i < clusters_.size(); ++i ) {
75 
76  // The cluster need i many distance entries, i.e., cluster 0 needs 0 entries,
77  // cluster 1 needs 1 entry (to compare it to cluster 0), and so forth.
78  clusters_[i].distances.resize( i );
79 
80  // Calculate the distances.
81  #pragma omp parallel for
82  for( size_t k = 0; k < i; ++k ) {
83  auto const dist = earth_movers_distance( clusters_[i].tree, clusters_[k].tree, p_ );
84  clusters_[i].distances[k] = dist;
85  }
86 
87  // Also, write out the trees for user output if needed.
88  if( write_cluster_tree ) {
89  write_cluster_tree( clusters_[i].tree, i );
90  }
91  }
92 }
93 
94 std::pair<size_t, size_t> SquashClustering::min_entry_() const
95 {
96  // Init.
97  size_t min_i = 0;
98  size_t min_j = 0;
99  double min_d = std::numeric_limits<double>::max();
100 
101  // Find min cell.
102  #pragma omp parallel for schedule(dynamic)
103  for( size_t i = 0; i < clusters_.size(); ++i ) {
104  if( ! clusters_[i].active ) {
105  continue;
106  }
107 
108  // We only need to check the "lower triangle".
109  assert( clusters_[i].distances.size() == i );
110  for( size_t j = 0; j < i; ++j ) {
111  if( ! clusters_[j].active ) {
112  continue;
113  }
114 
115  // Comparison.
116  #pragma omp flush( min_d )
117  if( clusters_[i].distances[j] < min_d ) {
118  #pragma omp critical( GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_MIN_ENTRY_UPDATE )
119  {
120  if( clusters_[i].distances[j] < min_d ) {
121  min_i = i;
122  min_j = j;
123  min_d = clusters_[i].distances[j];
124  }
125  }
126  }
127  }
128  }
129 
130  // We return reverse order, so that i < j. This is just more intuitive to work with.
131  assert( min_i > min_j );
132  return { min_j, min_i };
133 }
134 
135 void SquashClustering::merge_clusters_( size_t i, size_t j )
136 {
137  assert( i < j );
138  assert( i < clusters_.size() && j < clusters_.size() );
139  assert( i < clusters_[j].distances.size() );
140 
141  // Make new cluster.
142  clusters_.emplace_back();
143  auto& new_cluster = clusters_.back();
144 
145  // Make a new cluster tree as the weighted average of both given trees.
146  auto const weight_i = static_cast<double>( clusters_[i].count );
147  auto const weight_j = static_cast<double>( clusters_[j].count );
148  new_cluster.tree = mass_tree_merge_trees(
149  clusters_[i].tree, clusters_[j].tree, weight_i, weight_j
150  );
151  mass_tree_normalize_masses( new_cluster.tree );
152 
153  // If the user wants to write intermediate trees, to so now,
154  // so that we later can free the memory again.
155  if( write_cluster_tree ) {
156  assert( clusters_.size() > 0 );
157  write_cluster_tree( new_cluster.tree, clusters_.size() - 1 );
158  }
159 
160  // Old way: Scale masses, merge them, scale back.
161  // Scale both trees according to their weight (= count).
162  // mass_tree_scale_masses( clusters_[i].tree, static_cast<double>( clusters_[i].count ));
163  // mass_tree_scale_masses( clusters_[j].tree, static_cast<double>( clusters_[j].count ));
164  // new_cluster.tree = mass_tree_merge_trees( clusters_[i].tree, clusters_[j].tree );
165  //
166  // mass_tree_normalize_masses( new_cluster.tree );
167  // mass_tree_scale_masses( clusters_[i].tree, 1.0 / static_cast<double>( clusters_[i].count ));
168  // mass_tree_scale_masses( clusters_[j].tree, 1.0 / static_cast<double>( clusters_[j].count ));
169 
170  // Set other properties of the new cluster.
171  new_cluster.count = clusters_[i].count + clusters_[j].count;
172  new_cluster.active = true;
173  new_cluster.distances.resize( clusters_.size() - 1, 0.0 );
174 
175  // Calculate distances to still active clusters, which also includes the two clusters that
176  // we are about to merge. We will deactivate them after the loop. This way, we also compute
177  // their distances in parallel, maximizing OpenMP throughput!
178  #pragma omp parallel for schedule(dynamic)
179  for( size_t k = 0; k < clusters_.size() - 1; ++k ) {
180  if( ! clusters_[k].active ) {
181  continue;
182  }
183 
184  auto const dist = earth_movers_distance( new_cluster.tree, clusters_[k].tree, p_ );
185  new_cluster.distances[k] = dist;
186  }
187 
188  // Get the distance between the two clusters that we want to merge,
189  // and make a new cluster merger.
190  mergers_.push_back({ i, new_cluster.distances[i], j, new_cluster.distances[j] });
191 
192  // Test. How much does the avg dist differ from proper emd dist?
193  // We need to access the distance in reverse j i order, because of the lower triangle.
194  // auto const inner_dist = clusters_[j].distances[i];
195  // auto const avg_dist_i = inner_dist / static_cast<double>( clusters_[i].count );
196  // auto const avg_dist_j = inner_dist / static_cast<double>( clusters_[j].count );
197 
198  // Deactive. Those two clusters are now merged.
199  clusters_[i].active = false;
200  clusters_[j].active = false;
201 
202  // We don't need the distances any more. Save mem.
203  clusters_[i].distances.clear();
204  clusters_[j].distances.clear();
205 
206  // We can also destroy those trees. They haven been written before, and are not needed any more.
207  clusters_[i].tree.clear();
208  clusters_[j].tree.clear();
209 }
210 
211 void SquashClustering::run( std::vector<MassTree>&& trees )
212 {
213  // Number of trees we are going to cluster.
214  auto const tree_count = trees.size();
215 
216  // Init the result object.
217  // LOG_INFO << "Squash Clustering: Initialize";
218  if( report_initialization ) {
220  }
221  init_( std::move( trees ) );
222 
223  // Do a full clustering, until only one is left.
224  for( size_t i = 0; i < tree_count - 1; ++i ) {
225  // LOG_INFO << "Squash Clustering: Step " << (i+1) << " of " << tree_count - 1;
226  if( report_step ) {
227  report_step( i + 1, tree_count - 1 );
228  }
229 
230  auto const min_pair = min_entry_();
231  assert( min_pair.first < min_pair.second );
232 
233  merge_clusters_( min_pair.first, min_pair.second );
234  }
235 
236  // At the end, we only have one big cluster node.
237  assert( 1 == [&](){
238  size_t count_active = 0;
239  for( auto const& c : clusters_ ) {
240  if( c.active ) {
241  ++count_active;
242  }
243  }
244  return count_active;
245  }() );
246 
247  // Furthermore, make sure we have created the right number of mergers and clusters.
248  assert( tree_count + mergers_.size() == clusters_.size() );
249 }
250 
251 std::string SquashClustering::tree_string( std::vector<std::string> const& labels ) const
252 {
253  if( labels.size() != clusters_.size() - mergers_.size() ) {
254  throw std::runtime_error(
255  "List of labels does not have the correct size for the number of squash cluster elements."
256  );
257  }
258 
259  auto list = labels;
260  for( size_t i = 0; i < mergers_.size(); ++i ) {
261  auto const& cm = mergers_[i];
262 
263  auto node_a = list[ cm.index_a ] + ":" + std::to_string( cm.distance_a );
264  auto node_b = list[ cm.index_b ] + ":" + std::to_string( cm.distance_b );
265 
266  list[ cm.index_a ].clear();
267  list[ cm.index_b ].clear();
268 
269  list.push_back( "(" + node_a + "," + node_b + ")" + std::to_string( i + labels.size() ) );
270  }
271 
272  return list.back() + ";";
273 }
274 
276 {
277  // p_ = 1.0;
278  clusters_.clear();
279  mergers_.clear();
280 }
281 
282 } // namespace tree
283 } // namespace genesis
std::function< void(void)> report_initialization
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:64
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover&#39;s distance of two distributions of masses on a given Tree.
std::function< void(size_t i, size_t total)> report_step
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string tree_string(std::vector< std::string > const &labels) const
Build a Newick-format tree for visualizing the result of a squash_clustering().
std::function< void(MassTree const &cluster_tree, size_t index)> write_cluster_tree
Provides easy and fast logging functionality.
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
void run(std::vector< MassTree > &&trees)
Perfom Squash Clustering.
void clear()
Clear the clusters() and mergers() data.