A library for working with phylogenetic and population genetic data.
v0.32.0
squash_clustering.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_H_
2 #define GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
35 
36 #include <cstddef>
37 #include <functional>
38 #include <string>
39 #include <utility>
40 #include <vector>
41 
42 namespace genesis {
43 namespace tree {
44 
45 // =================================================================================================
46 // Squash Clustering
47 // =================================================================================================
48 
55 {
56 public:
57 
58  // -------------------------------------------------------------------------
59  // Typedefs and Constants
60  // -------------------------------------------------------------------------
61 
62  struct Cluster
63  {
71 
78  size_t count;
79 
85  bool active;
86 
95  std::vector<double> distances;
96  };
97 
98  struct Merger
99  {
103  size_t index_a;
104 
108  double distance_a;
109 
113  size_t index_b;
114 
118  double distance_b;
119  };
120 
121  // -------------------------------------------------------------------------
122  // Constructor and Rule of Five
123  // -------------------------------------------------------------------------
124 
125  SquashClustering() = default;
126  ~SquashClustering() = default;
127 
128  SquashClustering( SquashClustering const& ) = default;
129  SquashClustering( SquashClustering&& ) = default;
130 
131  SquashClustering& operator= ( SquashClustering const& ) = default;
133 
134  // -------------------------------------------------------------------------
135  // Public Functions
136  // -------------------------------------------------------------------------
137 
147  void run( std::vector<MassTree>&& trees );
148 
156  std::string tree_string( std::vector<std::string> const& labels ) const;
157 
158  SquashClustering& p( double value )
159  {
160  p_ = value;
161  return *this;
162  }
163 
164  double p() const
165  {
166  return p_;
167  }
168 
169  std::vector<Cluster> const& clusters() const
170  {
171  return clusters_;
172  }
173 
174  std::vector<Merger> const& mergers() const
175  {
176  return mergers_;
177  }
178 
182  void clear();
183 
184  // -------------------------------------------------------------------------
185  // Progress Report & Output
186  // -------------------------------------------------------------------------
187 
188  std::function<void( void )> report_initialization;
189  std::function<void( size_t i, size_t total )> report_step;
190 
191  std::function<void( MassTree const& cluster_tree, size_t index )> write_cluster_tree;
192 
193  // -------------------------------------------------------------------------
194  // Private Functions
195  // -------------------------------------------------------------------------
196 
197 private:
198 
199  void init_( std::vector<MassTree>&& trees );
200  std::pair<size_t, size_t> min_entry_() const;
201  void merge_clusters_( size_t i, size_t j );
202 
203  // -------------------------------------------------------------------------
204  // Data Members
205  // -------------------------------------------------------------------------
206 
207 private:
208 
209  double p_ = 1.0;
210 
211  std::vector<Cluster> clusters_;
212  std::vector<Merger> mergers_;
213 };
214 
215 } // namespace tree
216 } // namespace genesis
217 
218 #endif // include guard
genesis::tree::SquashClustering::clear
void clear()
Clear the clusters() and mergers() data.
Definition: squash_clustering.cpp:276
genesis::tree::SquashClustering::operator=
SquashClustering & operator=(SquashClustering const &)=default
genesis::tree::SquashClustering::write_cluster_tree
std::function< void(MassTree const &cluster_tree, size_t index)> write_cluster_tree
Definition: squash_clustering.hpp:191
genesis::tree::SquashClustering::clusters
std::vector< Cluster > const & clusters() const
Definition: squash_clustering.hpp:169
genesis::tree::SquashClustering::Cluster::active
bool active
Is this cluster active, i.e., is it not yet part of a larger cluster?
Definition: squash_clustering.hpp:85
genesis::tree::SquashClustering::SquashClustering
SquashClustering()=default
genesis::tree::SquashClustering::Merger
Definition: squash_clustering.hpp:98
genesis::tree::SquashClustering::report_step
std::function< void(size_t i, size_t total)> report_step
Definition: squash_clustering.hpp:189
genesis::tree::SquashClustering::tree_string
std::string tree_string(std::vector< std::string > const &labels) const
Build a Newick-format tree for visualizing the result of a squash_clustering().
Definition: squash_clustering.cpp:252
genesis::tree::SquashClustering::Cluster
Definition: squash_clustering.hpp:62
genesis::tree::SquashClustering::Merger::distance_a
double distance_a
Distance of the first data point to the cluster node.
Definition: squash_clustering.hpp:108
genesis::tree::SquashClustering::run
void run(std::vector< MassTree > &&trees)
Perfom Squash Clustering.
Definition: squash_clustering.cpp:212
tree.hpp
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
genesis::tree::SquashClustering::Cluster::tree
MassTree tree
The MassTree that this cluster represents.
Definition: squash_clustering.hpp:70
genesis::tree::SquashClustering::mergers
std::vector< Merger > const & mergers() const
Definition: squash_clustering.hpp:174
genesis::tree::SquashClustering::~SquashClustering
~SquashClustering()=default
genesis::sequence::labels
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:64
genesis::tree::SquashClustering::Cluster::distances
std::vector< double > distances
Distances from this cluster to all clusters with a lower index in the clusters vector.
Definition: squash_clustering.hpp:95
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::SquashClustering::p
double p() const
Definition: squash_clustering.hpp:164
genesis::tree::SquashClustering::Merger::index_b
size_t index_b
Index of the second data point in the cluster.
Definition: squash_clustering.hpp:113
genesis::tree::SquashClustering::Merger::distance_b
double distance_b
Distance of the second data point to the cluster node.
Definition: squash_clustering.hpp:118
genesis::tree::SquashClustering
Perform Squash Clustering.
Definition: squash_clustering.hpp:54
genesis::tree::SquashClustering::Merger::index_a
size_t index_a
Index of the first data point in the cluster.
Definition: squash_clustering.hpp:103
genesis::tree::SquashClustering::report_initialization
std::function< void(void)> report_initialization
Definition: squash_clustering.hpp:188
genesis::tree::SquashClustering::Cluster::count
size_t count
How many end points (Samples) does this cluster represent?
Definition: squash_clustering.hpp:78
genesis::tree::SquashClustering::p
SquashClustering & p(double value)
Definition: squash_clustering.hpp:158