56 void SquashClustering::init_( std::vector<MassTree>&& trees )
62 clusters_.resize( trees.size() );
63 for(
size_t i = 0; i < trees.size(); ++i ) {
64 clusters_[i].tree = std::move( trees[i] );
65 clusters_[i].count = 1;
66 clusters_[i].active =
true;
74 for(
size_t i = 0; i < clusters_.size(); ++i ) {
78 clusters_[i].distances.resize( i );
81 #pragma omp parallel for
82 for(
size_t k = 0; k < i; ++k ) {
84 clusters_[i].distances[k] = dist;
89 std::pair<size_t, size_t> SquashClustering::min_entry_()
const
94 double min_d = std::numeric_limits<double>::max();
97 #pragma omp parallel for schedule(dynamic)
98 for(
size_t i = 0; i < clusters_.size(); ++i ) {
99 if( ! clusters_[i].active ) {
104 assert( clusters_[i].distances.size() == i );
105 for(
size_t j = 0; j < i; ++j ) {
106 if( ! clusters_[j].active ) {
111 #pragma omp flush( min_d )
112 if( clusters_[i].distances[j] < min_d ) {
113 #pragma omp critical( GENESIS_TREE_MASS_TREE_SQUASH_CLUSTERING_MIN_ENTRY_UPDATE )
115 if( clusters_[i].distances[j] < min_d ) {
118 min_d = clusters_[i].distances[j];
126 assert( min_i > min_j );
127 return { min_j, min_i };
130 void SquashClustering::merge_clusters_(
size_t i,
size_t j )
133 assert( i < clusters_.size() && j < clusters_.size() );
134 assert( i < clusters_[j].distances.size() );
137 clusters_.emplace_back();
138 auto& new_cluster = clusters_.back();
141 auto const weight_i =
static_cast<double>( clusters_[i].count );
142 auto const weight_j =
static_cast<double>( clusters_[j].count );
144 clusters_[i].tree, clusters_[j].tree, weight_i, weight_j
159 new_cluster.count = clusters_[i].count + clusters_[j].count;
160 new_cluster.active =
true;
161 new_cluster.distances.resize( clusters_.size() - 1, 0.0 );
166 #pragma omp parallel for schedule(dynamic)
167 for(
size_t k = 0; k < clusters_.size() - 1; ++k ) {
168 if( ! clusters_[k].active ) {
173 new_cluster.distances[k] = dist;
178 mergers_.push_back({ i, new_cluster.distances[i], j, new_cluster.distances[j] });
187 clusters_[i].active =
false;
188 clusters_[j].active =
false;
191 clusters_[i].distances.clear();
192 clusters_[j].distances.clear();
202 auto const tree_count = trees.size();
209 init_( std::move( trees ) );
212 for(
size_t i = 0; i < tree_count - 1; ++i ) {
218 auto const min_pair = min_entry_();
219 assert( min_pair.first < min_pair.second );
221 merge_clusters_( min_pair.first, min_pair.second );
226 size_t count_active = 0;
227 for(
auto const& c : clusters_ ) {
236 assert( tree_count + mergers_.size() == clusters_.size() );
239 std::string SquashClustering::tree_string( std::vector<std::string>
const&
labels )
const
241 if( labels.size() != clusters_.size() - mergers_.size() ) {
242 throw std::runtime_error(
243 "List of labels does not have the correct size for the number of squash cluster elements."
248 for(
size_t i = 0; i < mergers_.size(); ++i ) {
249 auto const& cm = mergers_[i];
251 auto node_a = list[ cm.index_a ] +
":" +
std::to_string( cm.distance_a );
252 auto node_b = list[ cm.index_b ] +
":" +
std::to_string( cm.distance_b );
254 list[ cm.index_a ].clear();
255 list[ cm.index_b ].clear();
257 list.push_back(
"(" + node_a +
"," + node_b +
")" + std::to_string( i + labels.size() ) );
260 return list.back() +
";";
263 void SquashClustering::clear()
std::string to_string(T const &v)
Return a string representation of a given value.
std::function< std::string(void)> report_initialization
MassTree mass_tree_merge_trees(MassTree const &lhs, MassTree const &rhs, double const scaler_lhs, double const scaler_rhs)
Merge all masses of two MassTrees into one and return it.
Provides easy and fast logging functionality.
void mass_tree_normalize_masses(MassTree &tree)
Scale all masses of a MassTree so that they sum up to 1.0.
std::function< std::string(size_t i, size_t total)> report_step
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
void run(std::vector< MassTree > &&trees)
Perfom Squash Clustering.
void clear()
Clear the clusters() and mergers() data.
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover's distance of two distributions of masses on a given Tree.