A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
squash_clustering.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cmath>
41 #include <stdexcept>
42 #include <utility>
43 #include <vector>
44 
45 #ifdef GENESIS_OPENMP
46 # include <omp.h>
47 #endif
48 
49 namespace genesis {
50 namespace tree {
51 
52 // =================================================================================================
53 // Squash Clustering
54 // =================================================================================================
55 
56 SquashClustering squash_clustering_init( std::vector<MassTree>&& trees, double const p )
57 {
58  // Create empty result object. Both its clusters and mergeres are empty.
60 
61  // Move all trees as single data points to the cluster list, and make them active.
62  sc.clusters.resize( trees.size() );
63  for( size_t i = 0; i < trees.size(); ++i ) {
64  sc.clusters[i].tree = std::move( trees[i] );
65  sc.clusters[i].count = 1;
66  sc.clusters[i].active = true;
67  }
68 
69  // Fill the "lower triangle" of distances, i.e., all distances to elements with lower indices
70  // than the current one. We don't store this in a global distance matix, but in a vector
71  // for each cluster instead, as this makes it trivial to keep track of the data when merging
72  // clusters. No need to keep track of which row belongs to which cluster etc.
73  // We do this in a second loop, so that all trees haven been moved and OpenMP can access them.
74  for( size_t i = 0; i < sc.clusters.size(); ++i ) {
75 
76  // The cluster need i many distance entries, i.e., cluster 0 needs 0 entries,
77  // cluster 1 needs 1 entry (to compare it to cluster 0), and so forth.
78  sc.clusters[i].distances.resize( i );
79 
80  // Calculate the distances.
81  #pragma omp parallel for
82  for( size_t k = 0; k < i; ++k ) {
83  auto const dist = earth_movers_distance( sc.clusters[i].tree, sc.clusters[k].tree, p );
84  sc.clusters[i].distances[k] = dist;
85  }
86  }
87 
88  return sc;
89 }
90 
91 std::pair<size_t, size_t> squash_clustering_min_entry( SquashClustering const& sc )
92 {
93  // Init.
94  size_t min_i = 0;
95  size_t min_j = 0;
96  double min_d = std::numeric_limits<double>::max();
97 
98  // Find min cell.
99  #pragma omp parallel for schedule(dynamic)
100  for( size_t i = 0; i < sc.clusters.size(); ++i ) {
101  if( ! sc.clusters[i].active ) {
102  continue;
103  }
104 
105  // We only need to check the "lower triangle".
106  assert( sc.clusters[i].distances.size() == i );
107  for( size_t j = 0; j < i; ++j ) {
108  if( ! sc.clusters[j].active ) {
109  continue;
110  }
111 
112  // Comparison.
113  #pragma omp flush( min_d )
114  if( sc.clusters[i].distances[j] < min_d ) {
115  #pragma omp critical( GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_MIN_ENTRY_UPDATE )
116  {
117  if( sc.clusters[i].distances[j] < min_d ) {
118  min_i = i;
119  min_j = j;
120  min_d = sc.clusters[i].distances[j];
121  }
122  }
123  }
124  }
125  }
126 
127  // We return reverse order, so that i < j. This is just more intuitive to work with.
128  assert( min_i > min_j );
129  return { min_j, min_i };
130 }
131 
132 void squash_clustering_merge_clusters( SquashClustering& sc, size_t i, size_t j, double const p )
133 {
134  assert( i < j );
135  assert( i < sc.clusters.size() && j < sc.clusters.size() );
136  assert( i < sc.clusters[j].distances.size() );
137 
138  // Make new cluster.
139  sc.clusters.emplace_back();
140  auto& new_cluster = sc.clusters.back();
141 
142  // Make a new cluster tree as the weighted average of both given trees.
143  auto const weight_i = static_cast<double>( sc.clusters[i].count );
144  auto const weight_j = static_cast<double>( sc.clusters[j].count );
145  new_cluster.tree = mass_tree_merge_trees(
146  sc.clusters[i].tree, sc.clusters[j].tree, weight_i, weight_j
147  );
148  mass_tree_normalize_masses( new_cluster.tree );
149 
150  // Old way: Scale masses, merge them, scale back.
151  // Scale both trees according to their weight (= count).
152  // mass_tree_scale_masses( sc.clusters[i].tree, static_cast<double>( sc.clusters[i].count ));
153  // mass_tree_scale_masses( sc.clusters[j].tree, static_cast<double>( sc.clusters[j].count ));
154  // new_cluster.tree = mass_tree_merge_trees( sc.clusters[i].tree, sc.clusters[j].tree );
155  //
156  // mass_tree_normalize_masses( new_cluster.tree );
157  // mass_tree_scale_masses( sc.clusters[i].tree, 1.0 / static_cast<double>( sc.clusters[i].count ));
158  // mass_tree_scale_masses( sc.clusters[j].tree, 1.0 / static_cast<double>( sc.clusters[j].count ));
159 
160  // Set other properties of the new cluster.
161  new_cluster.count = sc.clusters[i].count + sc.clusters[j].count;
162  new_cluster.active = true;
163  new_cluster.distances.resize( sc.clusters.size() - 1, 0.0 );
164 
165  // Calculate distances to still active clusters, which also includes the two clusters that
166  // we are about to merge. We will deactivate them after the loop. This way, we also compute
167  // their distances in parallel, maximizing OpenMP throughput!
168  #pragma omp parallel for schedule(dynamic)
169  for( size_t k = 0; k < sc.clusters.size() - 1; ++k ) {
170  if( ! sc.clusters[k].active ) {
171  continue;
172  }
173 
174  auto const dist = earth_movers_distance( new_cluster.tree, sc.clusters[k].tree, p );
175  new_cluster.distances[k] = dist;
176  }
177 
178  // Get the distance between the two clusters that we want to merge,
179  // and make a new cluster merger.
180  sc.mergers.push_back({ i, new_cluster.distances[i], j, new_cluster.distances[j] });
181 
182  // Test. How much does the avg dist differ from proper emd dist?
183  // We need to access the distance in reverse j i order, because of the lower triangle.
184  // auto const inner_dist = sc.clusters[j].distances[i];
185  // auto const avg_dist_i = inner_dist / static_cast<double>( sc.clusters[i].count );
186  // auto const avg_dist_j = inner_dist / static_cast<double>( sc.clusters[j].count );
187 
188  // Deactive. Those two clusters are now merged.
189  sc.clusters[i].active = false;
190  sc.clusters[j].active = false;
191 
192  // We don't need the distances any more. Save mem.
193  sc.clusters[i].distances.clear();
194  sc.clusters[j].distances.clear();
195 
196  // We can also destroy those trees. For now. Maybe later, we want to write them first.
197  // sc.clusters[i].tree.clear();
198  // sc.clusters[j].tree.clear();
199 }
200 
201 SquashClustering squash_clustering( std::vector<MassTree>&& trees, double const p )
202 {
203  // Number of trees we are going to cluster.
204  auto const tree_count = trees.size();
205 
206  // Init the result object.
207  LOG_INFO << "Squash Clustering: Initialize";
208  SquashClustering sc = squash_clustering_init( std::move( trees ), p );
209 
210  // Do a full clustering, until only one is left.
211  for( size_t i = 0; i < tree_count - 1; ++i ) {
212  LOG_INFO << "Squash Clustering: Step " << (i+1) << " of " << tree_count - 1;
213 
214  auto const min_pair = squash_clustering_min_entry( sc );
215  assert( min_pair.first < min_pair.second );
216 
217  squash_clustering_merge_clusters( sc, min_pair.first, min_pair.second, p );
218  }
219 
220  // At the end, we only have one big cluster node.
221  assert( 1 == [&](){
222  size_t count_active = 0;
223  for( auto const& c : sc.clusters ) {
224  if( c.active ) {
225  ++count_active;
226  }
227  }
228  return count_active;
229  }() );
230 
231  // Furthermore, make sure we have created the right number of mergers and clusters.
232  assert( tree_count + sc.mergers.size() == sc.clusters.size() );
233 
234  return sc;
235 }
236 
237 std::string squash_cluster_tree( SquashClustering const& sc, std::vector<std::string> const& labels )
238 {
239  if( labels.size() != sc.clusters.size() - sc.mergers.size() ) {
240  throw std::runtime_error(
241  "List of labels does not have the correct size for the number of squash cluster elements."
242  );
243  }
244 
245  auto list = labels;
246  for( size_t i = 0; i < sc.mergers.size(); ++i ) {
247  auto const& cm = sc.mergers[i];
248 
249  auto node_a = list[ cm.index_a ] + ":" + std::to_string( cm.distance_a );
250  auto node_b = list[ cm.index_b ] + ":" + std::to_string( cm.distance_b );
251 
252  list[ cm.index_a ].clear();
253  list[ cm.index_b ].clear();
254 
255  list.push_back( "(" + node_a + "," + node_b + ")" + std::to_string( i + labels.size() ) );
256  }
257 
258  return list.back() + ";";
259 }
260 
261 } // namespace tree
262 } // namespace genesis
SquashClustering squash_clustering_init(std::vector< MassTree > &&trees, double const p)
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
Result structure for Squash Clustering.
std::pair< size_t, size_t > squash_clustering_min_entry(SquashClustering const &sc)
std::string squash_cluster_tree(SquashClustering const &sc, std::vector< std::string > const &labels)
Build a Newick-format tree for visualizing the result of a squash_clustering().
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Provides easy and fast logging functionality.
SquashClustering squash_clustering(std::vector< MassTree > &&trees, double const p)
Perfom Squash Clustering.
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
void squash_clustering_merge_clusters(SquashClustering &sc, size_t i, size_t j, double const p)
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:58
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover's distance of two distributions of masses on a given Tree.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98