A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utils/math/kmeans.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_MATH_KMEANS_H_
2 #define GENESIS_UTILS_MATH_KMEANS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2017 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cstddef>
41 #include <functional>
42 #include <limits>
43 #include <random>
44 #include <stdexcept>
45 #include <string>
46 #include <unordered_set>
47 #include <utility>
48 #include <vector>
49 
50 #ifdef GENESIS_OPENMP
51 # include <omp.h>
52 #endif
53 
54 namespace genesis {
55 namespace utils {
56 
57 // =================================================================================================
58 // Generic K-Means Class
59 // =================================================================================================
60 
64 template< typename Point >
65 class Kmeans
66 {
67 public:
68 
69  // -------------------------------------------------------------------------
70  // Typedefs and Constants
71  // -------------------------------------------------------------------------
72 
73  using value_type = Point;
74 
76  {
80  kNone
81  };
82 
88  {
89  std::vector<double> variances;
90  std::vector<size_t> counts;
91  std::vector<double> distances;
92  };
93 
94  // -------------------------------------------------------------------------
95  // Constructors and Rule of Five
96  // -------------------------------------------------------------------------
97 
98  Kmeans() = default;
99  virtual ~Kmeans() = default;
100 
101  Kmeans( Kmeans const& ) = default;
102  Kmeans( Kmeans&& ) = default;
103 
104  Kmeans& operator= ( Kmeans const& ) = default;
105  Kmeans& operator= ( Kmeans&& ) = default;
106 
107  // -------------------------------------------------------------------------
108  // Member Functions
109  // -------------------------------------------------------------------------
110 
111  size_t run( std::vector<Point> const& data, size_t const k )
112  {
113  // Run basic checks. This throws if necessary.
114  argument_checks_( data, k );
115 
116  // Init assigments and centroids.
117  initialize( data, k );
118 
119  // Call the hook.
120  pre_loop_hook( data, assignments_, centroids_ );
121 
122  // By now, the result vectors should be filled correctly.
123  // This replaces asserts. It is slightly more expensive, but this class offers so much
124  // expansion points and custom behaviour, that we better check thoroughly.
125  runtime_checks_( data, k );
126 
127  size_t iteration = 0;
128  bool changed_assigment;
129 
130  do {
131  // Start a new iteration.
132  LOG_INFO << "Iteration " << iteration;
133  changed_assigment = lloyd_step( data, assignments_, centroids_ );
134 
135  // Check again.
136  runtime_checks_( data, k );
137 
138  // Check if there are empty centroids, and if so, treat them.
139  auto const empty_centroids = get_empty_centroids_();
140  if( ! empty_centroids.empty() ) {
141  LOG_INFO << "Empty centroid occurred: " << empty_centroids.size();
142  changed_assigment |= treat_empty_centroids(
143  data, assignments_, centroids_, empty_centroids
144  );
145  }
146 
147  ++iteration;
148  } while( changed_assigment && iteration < max_iterations_ );
149 
150  // Call the hook.
151  post_loop_hook( data, assignments_, centroids_ );
152 
153  return iteration;
154  }
155 
156  // -------------------------------------------------------------------------
157  // Data Access
158  // -------------------------------------------------------------------------
159 
160  std::vector<size_t> const& assignments() const
161  {
162  return assignments_;
163  }
164 
165  Kmeans& assignments( std::vector<size_t> const& value )
166  {
167  assignments_ = value;
168  return *this;
169  }
170 
171  std::vector<Point> const& centroids() const
172  {
173  return centroids_;
174  }
175 
176  Kmeans& centroids( std::vector<Point> const& value )
177  {
178  centroids_ = value;
179  return *this;
180  }
181 
182  std::vector<size_t> cluster_sizes() const
183  {
184  if( assignments_.empty() || centroids_.empty() ) {
185  throw std::runtime_error( "No assignments or centroids set yet." );
186  }
187  auto result = std::vector<size_t>( centroids_.size(), 0 );
188  for( auto ass : assignments_ ) {
189  assert( ass <= result.size() );
190  ++result[ ass ];
191  }
192  return result;
193  }
194 
195  void clear()
196  {
197  assignments_.clear();
198  centroids_.clear();
199  }
200 
201  // -------------------------------------------------------------------------
202  // Properties
203  // -------------------------------------------------------------------------
204 
205  size_t max_iterations() const
206  {
207  return max_iterations_;
208  }
209 
210  Kmeans& max_iterations( size_t value )
211  {
212  if( value == 0 ) {
213  throw std::runtime_error( "Cannot use 0 as max_iterations for Kmeans." );
214  }
215  max_iterations_ = value;
216  return *this;
217  }
218 
220  {
221  return init_strategy_;
222  }
223 
225  {
226  init_strategy_ = value;
227  return *this;
228  }
229 
230  // -------------------------------------------------------------------------
231  // Virtual Functions
232  // -------------------------------------------------------------------------
233 
234 protected:
235 
236  virtual void initialize( std::vector<Point> const& data, size_t const k )
237  {
238  // Select init stragegies.
239  switch( init_strategy_ ) {
241  init_with_random_assignments_( data, k );
242  break;
243  }
245  init_with_random_centroids_( data, k );
246  break;
247  }
249  init_with_kmeans_plus_plus_( data, k );
250  break;
251  }
252  default: {}
253  }
254 
255  // If the strategy did not yield useful values, we still need to init somehow,
256  // so do this now. This also applies if kNone was selected for init, but no centroids were set.
257  if( assignments_.size() == 0 && centroids_.size() == 0 ) {
258  // Nothing given: Sample random centroids from the data.
259  init_with_random_centroids_( data, k );
260 
261  } else if( assignments_.size() == 0 && centroids_.size() > 0 ) {
262  // Centroids given, but no assigments: Nothing to do for now.
263  // We will calculate the proper assigments in the main loop.
264 
265  } else if( assignments_.size() > 0 && centroids_.size() == 0 ) {
266  // Assignments given, but not centroids: Caculate the latter.
267  update_centroids( data, assignments_, centroids_ );
268 
269  } else {
270  // Both given: Nothing to do.
271  assert( assignments_.size() > 0 && centroids_.size() > 0 );
272  }
273 
274  // If we do not have an assigment vector yet, make one. It will be assigned proper values
275  // once we enter the main loop.
276  if( assignments_.size() == 0 ) {
277  assignments_ = std::vector<size_t>( data.size(), 0 );
278  }
279  }
280 
281  virtual bool data_validation( std::vector<Point> const& data ) const
282  {
283  (void) data;
284  return true;
285  }
286 
287  virtual void pre_loop_hook(
288  std::vector<Point> const& data,
289  std::vector<size_t>& assignments,
290  std::vector<Point>& centroids
291  ) {
292  (void) data;
293  (void) assignments;
294  (void) centroids;
295  }
296 
297  virtual bool lloyd_step(
298  std::vector<Point> const& data,
299  std::vector<size_t>& assignments,
300  std::vector<Point>& centroids
301  ) {
302  // Calculate new assignments and check whether they changed.
303  auto changed_assigment = assign_to_centroids( data, centroids, assignments );
304 
305  // Recalculate the centroids.
306  update_centroids( data, assignments, centroids );
307 
308  return changed_assigment;
309  }
310 
311  virtual bool assign_to_centroids(
312  std::vector<Point> const& data,
313  std::vector<Point> const& centroids,
314  std::vector<size_t>& assignments
315  ) {
316  // Store whether anything changed.
317  bool changed_assigment = false;
318 
319  // Assign each Point to its nearest centroid.
320  #pragma omp parallel for
321  for( size_t i = 0; i < data.size(); ++i ) {
322  auto const new_idx = find_nearest_cluster( centroids, data[i] ).first;
323 
324  if( new_idx != assignments[i] ) {
325  // Update the assignment. No need for locking, as each thread works on its own i.
326  assignments[i] = new_idx;
327 
328  // If we have a new assigment for this datum, we need to do another loop iteration.
329  // Do this atomically, as all threads use this variable.
330  #pragma omp atomic write
331  changed_assigment = true;
332  }
333  }
334 
335  return changed_assigment;
336  }
337 
338  virtual std::pair<size_t, double> find_nearest_cluster(
339  std::vector<Point> const& centroids,
340  Point const& datum
341  ) const {
342  size_t min_i = std::numeric_limits<size_t>::max();
343  double min_d = std::numeric_limits<double>::max();
344 
345  assert( centroids.size() > 0 );
346  for( size_t i = 0; i < centroids.size(); ++i ) {
347  auto const dist = distance( datum, centroids[i] );
348  if( dist < min_d ) {
349  min_i = i;
350  min_d = dist;
351  }
352  }
353 
354  return { min_i, min_d };
355  }
356 
357  virtual ClusteringInfo cluster_info(
358  std::vector<Point> const& data,
359  std::vector<size_t> const& assignments,
360  std::vector<Point> const& centroids
361  ) const {
362  auto const k = centroids.size();
363 
364  auto result = ClusteringInfo();
365  result.variances = std::vector<double>( k, 0.0 );
366  result.counts = std::vector<size_t>( k, 0 );
367  result.distances = std::vector<double>( data.size(), 0.0 );
368 
369  // Work through the data and assigments and accumulate.
370  #pragma omp parallel for
371  for( size_t i = 0; i < data.size(); ++i ) {
372 
373  // Shorthands.
374  auto const a = assignments[ i ];
375  assert( a < k );
376  auto const& centroid = centroids[ a ];
377 
378  // Get dist from datum to centroid.
379  auto const dist = distance( centroid, data[ i ] );
380  result.distances[ i ] = dist;
381 
382  // Update centroid accumulators.
383  #pragma omp atomic update
384  result.variances[ a ] += dist * dist;
385  #pragma omp atomic update
386  ++result.counts[ a ];
387  }
388 
389  // Build the mean dist to get the variance for each centroid.
390  for( size_t i = 0; i < k; ++i ) {
391  if( result.counts[ i ] > 0 ) {
392  result.variances[ i ] /= result.counts[ i ];
393  }
394  }
395 
396  return result;
397  }
398 
399  virtual bool treat_empty_centroids(
400  std::vector<Point> const& data,
401  std::vector<size_t>& assignments,
402  std::vector<Point>& centroids,
403  std::unordered_set<size_t> const& empty_centroids
404  ) {
405  // If there are not empty centroids, we have nothing to do, and did not change anything,
406  // so return false.
407  if( empty_centroids.empty() ) {
408  return false;
409  }
410 
411  // Process all empty centroid indices.
412  for( auto const& ec_idx : empty_centroids ) {
413  // Get variances and counts of clusters and distances from data to them.
414  auto clus_info = cluster_info( data, assignments, centroids );
415  assert( clus_info.variances.size() == centroids.size() );
416  assert( clus_info.distances.size() == data.size() );
417  assert( data.size() == assignments.size() );
418 
419  // Get index of centroid with max variance.
420  auto const max_var_idx = static_cast<size_t>( std::distance(
421  clus_info.variances.begin(),
422  std::max_element( clus_info.variances.begin(), clus_info.variances.end() )
423  ));
424 
425  // If the max variance is 0, we cannot do anything. All points are the same.
426  if( clus_info.variances[ max_var_idx ] == 0.0 ) {
427  return false;
428  }
429 
430  // The empty centroid cannot be the same as the one we want to take a point from,
431  // because empty clusters have a variance of 0. If this assertion fails, we probably
432  // ran an analysis with k >> data.size() or so.
433  assert( ec_idx != max_var_idx );
434 
435  // The current empty cluster should actually be empty.
436  assert( clus_info.counts[ ec_idx ] == 0 );
437  assert( clus_info.variances[ ec_idx ] == 0.0 );
438 
439  // Find the point in the max var cluster that is furthest away from the centroid.
440  size_t furth_idx = std::numeric_limits<size_t>::max();
441  double furth_dist = std::numeric_limits<double>::lowest();
442  for( size_t i = 0; i < data.size(); ++i ) {
443  if( assignments[i] != max_var_idx ) {
444  continue;
445  }
446  if( clus_info.distances[ i ] > furth_dist ) {
447  furth_idx = i;
448  furth_dist = clus_info.distances[ i ];
449  }
450  }
451 
452  // The point needs to be part of the max var cluster, otherwise something went wrong.
453  assert( assignments[ furth_idx ] == max_var_idx );
454 
455  // Add the point to the empty cluster.
456  assignments[ furth_idx ] = ec_idx;
457 
458  // The following is some test code that could be used to avoid calculating the cluster
459  // info for each empty cluster. However, as we probably almost never run into this
460  // function anyway, we better keep it simple and calculate the info fresh every time.
461 
462  // // Adjust variance and count of the max var cluster.
463  // auto const dsqrt = clus_info.distances[ furth_idx ] * clus_info.distances[ furth_idx ];
464  // clus_info.variances[ max_var_idx ] *= clus_info.counts[ max_var_idx ];
465  // clus_info.variances[ max_var_idx ] -= dsqrt;
466  // --clus_info.counts[ max_var_idx ];
467  // clus_info.variances[ max_var_idx ] /= clus_info.counts[ max_var_idx ];
468  //
469  // // Adjust count of the (now not any longer) empty cluster.
470  // ++clus_info.counts[ ec_idx ];
471 
472  // Finally, we need to update the centroids in order to reflect the changes.
473  update_centroids( data, assignments, centroids );
474  }
475 
476  // Now we return true, because we changed some assignments.
477  return true;
478  }
479 
480  virtual double distance(
481  Point const& lhs,
482  Point const& rhs
483  ) const = 0;
484 
485  virtual void update_centroids(
486  std::vector<Point> const& data,
487  std::vector<size_t> const& assignments,
488  std::vector<Point>& centroids
489  ) = 0;
490 
491  virtual void post_loop_hook(
492  std::vector<Point> const& data,
493  std::vector<size_t>& assignments,
494  std::vector<Point>& centroids
495  ) {
496  (void) data;
497  (void) assignments;
498  (void) centroids;
499  }
500 
501  // -------------------------------------------------------------------------
502  // Internal Functions
503  // -------------------------------------------------------------------------
504 
505 private:
506 
507  void argument_checks_(
508  std::vector<Point> const& data,
509  size_t const k
510  ) const {
511  // Basic checks.
512  if( k > data.size() ) {
513  LOG_WARN << "Cannot run Kmeans with more clusters (k == " << k << ") than data points ("
514  << data.size() << ")";
515  }
516  if( k == 0 ) {
517  throw std::runtime_error(
518  "Cannot run Kmeans with zero clusters (k == 0)."
519  );
520  }
521 
522  // Validate the data. The function might also throw on its own, in order
523  // to provide a more helpful message about what is actually invalid about the data.
524  if( ! data_validation( data ) ) {
525  throw std::runtime_error( "Invalid data." );
526  }
527  }
528 
529  void runtime_checks_(
530  std::vector<Point> const& data,
531  size_t const k
532  ) const {
533  if( assignments_.size() != data.size() ) {
534  throw std::runtime_error(
535  "Assignments has size " + std::to_string( assignments_.size() ) +
536  " but data has size " + std::to_string( data.size() ) + "."
537  );
538  }
539  for( auto const& assign : assignments_ ) {
540  if( assign >= k ) {
541  throw std::runtime_error(
542  "Invalid assignment " + std::to_string( assign ) +
543  " >= k = " + std::to_string( k ) + "."
544  );
545  }
546  }
547  if( centroids_.size() != k ) {
548  throw std::runtime_error(
549  "Centroids has size " + std::to_string( centroids_.size() ) +
550  " but k is " + std::to_string( k ) + "."
551  );
552  }
553  }
554 
555  void init_with_random_assignments_(
556  std::vector<Point> const& data,
557  size_t const k
558  ) {
559  // Prepare a vector of the desired size.
560  assignments_ = std::vector<size_t>( data.size(), 0 );
561 
562  // Prepare a random distribution in range [0,k).
563  auto& engine = Options::get().random_engine();
564  std::uniform_int_distribution<size_t> distribution( 0, k - 1 );
565 
566  // Assign random cluster indices for each data point.
567  for( size_t i = 0; i < data.size(); ++i ) {
568  assignments_[ i ] = distribution( engine );
569  }
570  }
571 
572  void init_with_random_centroids_(
573  std::vector<Point> const& data,
574  size_t const k
575  ) {
576  // Prepare centroids vector. Empty, because we don't want to assume any default
577  // constructor for the Points.
578  centroids_ = std::vector<Point>();
579 
580  // Select k unique numbers out of the interval [ 0, data.size() ),
581  // and copy those data points to the centroids.
582  auto idxs = select_without_replacement( k, data.size() );
583  for( auto const idx : idxs ) {
584  centroids_.push_back( data[idx] );
585  }
586 
587  assert( centroids_.size() == k );
588  }
589 
590  void init_with_kmeans_plus_plus_(
591  std::vector<Point> const& data,
592  size_t const k
593  ) {
594  // Prepare centroids vector. Empty, because we don't want to assume any default
595  // constructor for the Points.
596  centroids_ = std::vector<Point>();
597 
598  // Shorthand.
599  auto& engine = Options::get().random_engine();
600 
601  // Use a random point as first centroid.
602  std::uniform_int_distribution<size_t> first_dist( 0, data.size() - 1 );
603  centroids_.push_back( data[ first_dist(engine) ]);
604 
605  // Prepare a vector of probabilities to select each data point as a centroid.
606  auto data_probs = std::vector<double>( data.size(), 0.0 );
607 
608  // Add more centroids.
609  for( size_t i = 1; i < k; ++i ) {
610 
611  // For each data point...
612  #pragma omp parallel for
613  for( size_t di = 0; di < data.size(); ++di ) {
614 
615  // ...find the closest centroid (of the ones that are produced so far), ...
616  double const min_d = find_nearest_cluster( centroids_, data[ di ] ).second;
617 
618  // ...and use its square as probability to select this point.
619  // (No need for OpenMP locking here, as di is unique to each thread).
620  data_probs[ di ] = min_d * min_d;
621  }
622 
623  // Now select a new centroid from the data, according to the given probabilities.
624  std::discrete_distribution<size_t> distribution(
625  data_probs.begin(), data_probs.end()
626  );
627  auto idx = distribution( engine );
628  assert( idx < data.size() );
629  centroids_.push_back( data[ idx ] );
630  }
631 
632  assert( centroids_.size() == k );
633  }
634 
635  std::unordered_set<size_t> get_empty_centroids_() {
636  auto const k = centroids_.size();
637 
638  // Fill a list with all numbers up to k...
639  auto empties = std::unordered_set<size_t>();
640  for( size_t i = 0; i < k; ++i ) {
641  empties.insert( i );
642  }
643 
644  // ... then remove all assigned ones again.
645  for( size_t i = 0; i < assignments_.size(); ++i ) {
646  assert( assignments_[i] < k );
647  empties.erase( assignments_[i] );
648 
649  // Prematurely exit if there is nothing else to remove.
650  // We don't want to go to all assignments if not necessary.
651  if( empties.empty() ) {
652  return empties;
653  }
654  }
655 
656  // If we are here, there are empty centroids, otherwise we'd have prematurely exited.
657  assert( ! empties.empty() );
658  return empties;
659  }
660 
661  // -------------------------------------------------------------------------
662  // Data Members
663  // -------------------------------------------------------------------------
664 
665 private:
666 
667  std::vector<size_t> assignments_;
668  std::vector<Point> centroids_;
669 
670  size_t max_iterations_ = 100;
672 
673 };
674 
675 // =================================================================================================
676 // Euclidean K-Means Specialization
677 // =================================================================================================
678 
680  : public Kmeans< std::vector<double> >
681 {
682 public:
683 
684  // -------------------------------------------------------------------------
685  // Typedefs and Constants
686  // -------------------------------------------------------------------------
687 
688  using Point = std::vector<double>;
689 
690  // -------------------------------------------------------------------------
691  // Constructors and Rule of Five
692  // -------------------------------------------------------------------------
693 
694  EuclideanKmeans( size_t dimensions );
695  virtual ~EuclideanKmeans() = default;
696 
697  EuclideanKmeans( EuclideanKmeans const& ) = default;
698  EuclideanKmeans( EuclideanKmeans&& ) = default;
699 
700  EuclideanKmeans& operator= ( EuclideanKmeans const& ) = default;
702 
703  // -------------------------------------------------------------------------
704  // Default K-Means Functions
705  // -------------------------------------------------------------------------
706 
707 private:
708 
709  virtual bool data_validation( std::vector<Point> const& data ) const override;
710 
711  virtual void update_centroids(
712  std::vector<Point> const& data,
713  std::vector<size_t> const& assignments,
714  std::vector<Point>& centroids
715  ) override;
716 
717  virtual double distance( Point const& lhs, Point const& rhs ) const override;
718 
719  // -------------------------------------------------------------------------
720  // Data Members
721  // -------------------------------------------------------------------------
722 
723 private:
724 
725  size_t dimensions_;
726 
727 };
728 
729 } // namespace utils
730 } // namespace genesis
731 
732 #endif // include guard
InitializationStrategy initialization_strategy() const
virtual bool treat_empty_centroids(std::vector< Point > const &data, std::vector< size_t > &assignments, std::vector< Point > &centroids, std::unordered_set< size_t > const &empty_centroids)
Kmeans & centroids(std::vector< Point > const &value)
virtual ~EuclideanKmeans()=default
#define LOG_WARN
Log a warning. See genesis::utils::LoggingLevel.
Definition: logging.hpp:95
size_t run(std::vector< Point > const &data, size_t const k)
std::vector< Point > const & centroids() const
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
EuclideanKmeans & operator=(EuclideanKmeans const &)=default
virtual void initialize(std::vector< Point > const &data, size_t const k)
virtual std::pair< size_t, double > find_nearest_cluster(std::vector< Point > const &centroids, Point const &datum) const
virtual void pre_loop_hook(std::vector< Point > const &data, std::vector< size_t > &assignments, std::vector< Point > &centroids)
virtual bool lloyd_step(std::vector< Point > const &data, std::vector< size_t > &assignments, std::vector< Point > &centroids)
virtual ~Kmeans()=default
size_t max_iterations() const
Kmeans & max_iterations(size_t value)
virtual bool assign_to_centroids(std::vector< Point > const &data, std::vector< Point > const &centroids, std::vector< size_t > &assignments)
Helper POD that stores the variances and number of data points of each centroid, as well as the dista...
Provides easy and fast logging functionality.
virtual ClusteringInfo cluster_info(std::vector< Point > const &data, std::vector< size_t > const &assignments, std::vector< Point > const &centroids) const
virtual bool data_validation(std::vector< Point > const &data) const
std::default_random_engine & random_engine()
Returns the default engine for random number generation.
Definition: options.hpp:151
Kmeans & operator=(Kmeans const &)=default
std::vector< size_t > select_without_replacement(size_t const k, size_t const n)
Select k many unique numbers out of the range [ 0, n ).
Definition: random.cpp:46
Kmeans & assignments(std::vector< size_t > const &value)
std::vector< size_t > cluster_sizes() const
virtual void post_loop_hook(std::vector< Point > const &data, std::vector< size_t > &assignments, std::vector< Point > &centroids)
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:59
virtual void update_centroids(std::vector< Point > const &data, std::vector< size_t > const &assignments, std::vector< Point > &centroids)=0
virtual double distance(Point const &lhs, Point const &rhs) const =0
std::vector< size_t > const & assignments() const
Kmeans & initialization_strategy(InitializationStrategy value)
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98