56 void SquashClustering::init_( std::vector<MassTree>&& trees )
62 clusters_.resize( trees.size() );
63 for(
size_t i = 0; i < trees.size(); ++i ) {
64 clusters_[i].tree = std::move( trees[i] );
65 clusters_[i].count = 1;
66 clusters_[i].active =
true;
74 for(
size_t i = 0; i < clusters_.size(); ++i ) {
78 clusters_[i].distances.resize( i );
81 #pragma omp parallel for 82 for(
size_t k = 0; k < i; ++k ) {
84 clusters_[i].distances[k] = dist;
94 std::pair<size_t, size_t> SquashClustering::min_entry_()
const 99 double min_d = std::numeric_limits<double>::max();
102 #pragma omp parallel for schedule(dynamic) 103 for(
size_t i = 0; i < clusters_.size(); ++i ) {
104 if( ! clusters_[i].active ) {
109 assert( clusters_[i].distances.size() == i );
110 for(
size_t j = 0; j < i; ++j ) {
111 if( ! clusters_[j].active ) {
116 #pragma omp flush( min_d ) 117 if( clusters_[i].distances[j] < min_d ) {
118 #pragma omp critical( GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_MIN_ENTRY_UPDATE ) 120 if( clusters_[i].distances[j] < min_d ) {
123 min_d = clusters_[i].distances[j];
131 assert( min_i > min_j );
132 return { min_j, min_i };
135 void SquashClustering::merge_clusters_(
size_t i,
size_t j )
138 assert( i < clusters_.size() && j < clusters_.size() );
139 assert( i < clusters_[j].distances.size() );
142 clusters_.emplace_back();
143 auto& new_cluster = clusters_.back();
146 auto const weight_i =
static_cast<double>( clusters_[i].count );
147 auto const weight_j =
static_cast<double>( clusters_[j].count );
149 clusters_[i].tree, clusters_[j].tree, weight_i, weight_j
156 assert( clusters_.size() > 0 );
171 new_cluster.count = clusters_[i].count + clusters_[j].count;
172 new_cluster.active =
true;
173 new_cluster.distances.resize( clusters_.size() - 1, 0.0 );
178 #pragma omp parallel for schedule(dynamic) 179 for(
size_t k = 0; k < clusters_.size() - 1; ++k ) {
180 if( ! clusters_[k].active ) {
185 new_cluster.distances[k] = dist;
190 mergers_.push_back({ i, new_cluster.distances[i], j, new_cluster.distances[j] });
199 clusters_[i].active =
false;
200 clusters_[j].active =
false;
203 clusters_[i].distances.clear();
204 clusters_[j].distances.clear();
207 clusters_[i].tree.clear();
208 clusters_[j].tree.clear();
214 auto const tree_count = trees.size();
221 init_( std::move( trees ) );
224 for(
size_t i = 0; i < tree_count - 1; ++i ) {
230 auto const min_pair = min_entry_();
231 assert( min_pair.first < min_pair.second );
233 merge_clusters_( min_pair.first, min_pair.second );
238 size_t count_active = 0;
239 for(
auto const& c : clusters_ ) {
248 assert( tree_count + mergers_.size() == clusters_.size() );
253 if( labels.size() != clusters_.size() - mergers_.size() ) {
254 throw std::runtime_error(
255 "List of labels does not have the correct size for the number of squash cluster elements." 260 for(
size_t i = 0; i < mergers_.size(); ++i ) {
261 auto const& cm = mergers_[i];
263 auto node_a = list[ cm.index_a ] +
":" +
std::to_string( cm.distance_a );
264 auto node_b = list[ cm.index_b ] +
":" +
std::to_string( cm.distance_b );
266 list[ cm.index_a ].clear();
267 list[ cm.index_b ].clear();
269 list.push_back(
"(" + node_a +
"," + node_b +
")" + std::to_string( i + labels.size() ) );
272 return list.back() +
";";
std::function< void(void)> report_initialization
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover's distance of two distributions of masses on a given Tree.
std::function< void(size_t i, size_t total)> report_step
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string tree_string(std::vector< std::string > const &labels) const
Build a Newick-format tree for visualizing the result of a squash_clustering().
std::function< void(MassTree const &cluster_tree, size_t index)> write_cluster_tree
Provides easy and fast logging functionality.
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
void run(std::vector< MassTree > &&trees)
Perfom Squash Clustering.
void clear()
Clear the clusters() and mergers() data.