A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
squash_clustering.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cmath>
41 #include <stdexcept>
42 #include <utility>
43 #include <vector>
44 
45 #ifdef GENESIS_OPENMP
46 # include <omp.h>
47 #endif
48 
49 namespace genesis {
50 namespace tree {
51 
52 // =================================================================================================
53 // Squash Clustering
54 // =================================================================================================
55 
56 void SquashClustering::init_( std::vector<MassTree>&& trees )
57 {
58  // Clear. Both its clusters and mergeres are empty.
59  clear();
60 
61  // Move all trees as single data points to the cluster list, and make them active.
62  clusters_.resize( trees.size() );
63  for( size_t i = 0; i < trees.size(); ++i ) {
64  clusters_[i].tree = std::move( trees[i] );
65  clusters_[i].count = 1;
66  clusters_[i].active = true;
67  }
68 
69  // Fill the "lower triangle" of distances, i.e., all distances to elements with lower indices
70  // than the current one. We don't store this in a global distance matix, but in a vector
71  // for each cluster instead, as this makes it trivial to keep track of the data when merging
72  // clusters. No need to keep track of which row belongs to which cluster etc.
73  // We do this in a second loop, so that all trees haven been moved and OpenMP can access them.
74  for( size_t i = 0; i < clusters_.size(); ++i ) {
75 
76  // The cluster need i many distance entries, i.e., cluster 0 needs 0 entries,
77  // cluster 1 needs 1 entry (to compare it to cluster 0), and so forth.
78  clusters_[i].distances.resize( i );
79 
80  // Calculate the distances.
81  #pragma omp parallel for
82  for( size_t k = 0; k < i; ++k ) {
83  auto const dist = earth_movers_distance( clusters_[i].tree, clusters_[k].tree, p_ );
84  clusters_[i].distances[k] = dist;
85  }
86  }
87 }
88 
89 std::pair<size_t, size_t> SquashClustering::min_entry_() const
90 {
91  // Init.
92  size_t min_i = 0;
93  size_t min_j = 0;
94  double min_d = std::numeric_limits<double>::max();
95 
96  // Find min cell.
97  #pragma omp parallel for schedule(dynamic)
98  for( size_t i = 0; i < clusters_.size(); ++i ) {
99  if( ! clusters_[i].active ) {
100  continue;
101  }
102 
103  // We only need to check the "lower triangle".
104  assert( clusters_[i].distances.size() == i );
105  for( size_t j = 0; j < i; ++j ) {
106  if( ! clusters_[j].active ) {
107  continue;
108  }
109 
110  // Comparison.
111  #pragma omp flush( min_d )
112  if( clusters_[i].distances[j] < min_d ) {
113  #pragma omp critical( GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_MIN_ENTRY_UPDATE )
114  {
115  if( clusters_[i].distances[j] < min_d ) {
116  min_i = i;
117  min_j = j;
118  min_d = clusters_[i].distances[j];
119  }
120  }
121  }
122  }
123  }
124 
125  // We return reverse order, so that i < j. This is just more intuitive to work with.
126  assert( min_i > min_j );
127  return { min_j, min_i };
128 }
129 
130 void SquashClustering::merge_clusters_( size_t i, size_t j )
131 {
132  assert( i < j );
133  assert( i < clusters_.size() && j < clusters_.size() );
134  assert( i < clusters_[j].distances.size() );
135 
136  // Make new cluster.
137  clusters_.emplace_back();
138  auto& new_cluster = clusters_.back();
139 
140  // Make a new cluster tree as the weighted average of both given trees.
141  auto const weight_i = static_cast<double>( clusters_[i].count );
142  auto const weight_j = static_cast<double>( clusters_[j].count );
143  new_cluster.tree = mass_tree_merge_trees(
144  clusters_[i].tree, clusters_[j].tree, weight_i, weight_j
145  );
146  mass_tree_normalize_masses( new_cluster.tree );
147 
148  // Old way: Scale masses, merge them, scale back.
149  // Scale both trees according to their weight (= count).
150  // mass_tree_scale_masses( clusters_[i].tree, static_cast<double>( clusters_[i].count ));
151  // mass_tree_scale_masses( clusters_[j].tree, static_cast<double>( clusters_[j].count ));
152  // new_cluster.tree = mass_tree_merge_trees( clusters_[i].tree, clusters_[j].tree );
153  //
154  // mass_tree_normalize_masses( new_cluster.tree );
155  // mass_tree_scale_masses( clusters_[i].tree, 1.0 / static_cast<double>( clusters_[i].count ));
156  // mass_tree_scale_masses( clusters_[j].tree, 1.0 / static_cast<double>( clusters_[j].count ));
157 
158  // Set other properties of the new cluster.
159  new_cluster.count = clusters_[i].count + clusters_[j].count;
160  new_cluster.active = true;
161  new_cluster.distances.resize( clusters_.size() - 1, 0.0 );
162 
163  // Calculate distances to still active clusters, which also includes the two clusters that
164  // we are about to merge. We will deactivate them after the loop. This way, we also compute
165  // their distances in parallel, maximizing OpenMP throughput!
166  #pragma omp parallel for schedule(dynamic)
167  for( size_t k = 0; k < clusters_.size() - 1; ++k ) {
168  if( ! clusters_[k].active ) {
169  continue;
170  }
171 
172  auto const dist = earth_movers_distance( new_cluster.tree, clusters_[k].tree, p_ );
173  new_cluster.distances[k] = dist;
174  }
175 
176  // Get the distance between the two clusters that we want to merge,
177  // and make a new cluster merger.
178  mergers_.push_back({ i, new_cluster.distances[i], j, new_cluster.distances[j] });
179 
180  // Test. How much does the avg dist differ from proper emd dist?
181  // We need to access the distance in reverse j i order, because of the lower triangle.
182  // auto const inner_dist = clusters_[j].distances[i];
183  // auto const avg_dist_i = inner_dist / static_cast<double>( clusters_[i].count );
184  // auto const avg_dist_j = inner_dist / static_cast<double>( clusters_[j].count );
185 
186  // Deactive. Those two clusters are now merged.
187  clusters_[i].active = false;
188  clusters_[j].active = false;
189 
190  // We don't need the distances any more. Save mem.
191  clusters_[i].distances.clear();
192  clusters_[j].distances.clear();
193 
194  // We can also destroy those trees. For now. Maybe later, we want to write them first.
195  // clusters_[i].tree.clear();
196  // clusters_[j].tree.clear();
197 }
198 
199 void SquashClustering::run( std::vector<MassTree>&& trees )
200 {
201  // Number of trees we are going to cluster.
202  auto const tree_count = trees.size();
203 
204  // Init the result object.
205  // LOG_INFO << "Squash Clustering: Initialize";
206  if( report_initialization ) {
208  }
209  init_( std::move( trees ) );
210 
211  // Do a full clustering, until only one is left.
212  for( size_t i = 0; i < tree_count - 1; ++i ) {
213  // LOG_INFO << "Squash Clustering: Step " << (i+1) << " of " << tree_count - 1;
214  if( report_step ) {
215  report_step( i + 1, tree_count - 1 );
216  }
217 
218  auto const min_pair = min_entry_();
219  assert( min_pair.first < min_pair.second );
220 
221  merge_clusters_( min_pair.first, min_pair.second );
222  }
223 
224  // At the end, we only have one big cluster node.
225  assert( 1 == [&](){
226  size_t count_active = 0;
227  for( auto const& c : clusters_ ) {
228  if( c.active ) {
229  ++count_active;
230  }
231  }
232  return count_active;
233  }() );
234 
235  // Furthermore, make sure we have created the right number of mergers and clusters.
236  assert( tree_count + mergers_.size() == clusters_.size() );
237 }
238 
239 std::string SquashClustering::tree_string( std::vector<std::string> const& labels ) const
240 {
241  if( labels.size() != clusters_.size() - mergers_.size() ) {
242  throw std::runtime_error(
243  "List of labels does not have the correct size for the number of squash cluster elements."
244  );
245  }
246 
247  auto list = labels;
248  for( size_t i = 0; i < mergers_.size(); ++i ) {
249  auto const& cm = mergers_[i];
250 
251  auto node_a = list[ cm.index_a ] + ":" + std::to_string( cm.distance_a );
252  auto node_b = list[ cm.index_b ] + ":" + std::to_string( cm.distance_b );
253 
254  list[ cm.index_a ].clear();
255  list[ cm.index_b ].clear();
256 
257  list.push_back( "(" + node_a + "," + node_b + ")" + std::to_string( i + labels.size() ) );
258  }
259 
260  return list.back() + ";";
261 }
262 
263 void SquashClustering::clear()
264 {
265  // p_ = 1.0;
266  clusters_.clear();
267  mergers_.clear();
268 }
269 
270 } // namespace tree
271 } // namespace genesis
std::function< void(void)> report_initialization
std::function< void(size_t i, size_t total)> report_step
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:381
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Provides easy and fast logging functionality.
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:62
void run(std::vector< MassTree > &&trees)
Perfom Squash Clustering.
void clear()
Clear the clusters() and mergers() data.
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover's distance of two distributions of masses on a given Tree.