A library for working with phylogenetic and population genetic data.
v0.32.0
euclidean_kmeans.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2023 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
33 #include <algorithm>
34 #include <cmath>
35 
36 namespace genesis {
37 namespace utils {
38 
39 // ================================================================================================
40 // Euclidian K-Means Specialization
41 // ================================================================================================
42 
43 // -------------------------------------------------------------------------
44 // Constructors and Rule of Five
45 // -------------------------------------------------------------------------
46 
48  : dimensions_( dimensions )
49 {}
50 
51 // -------------------------------------------------------------------------
52 // Default K-Means Functions
53 // -------------------------------------------------------------------------
54 
55 bool EuclideanKmeans::data_validation( std::vector<Point> const& data ) const
56 {
57  for( size_t i = 0; i < data.size(); ++i ) {
58  auto const& datum = data[i];
59  if( datum.size() != dimensions_ ) {
60  throw std::runtime_error(
61  "Datum at " + std::to_string( i ) + " has invalid dimension " +
62  std::to_string( datum.size() ) + " instead of " + std::to_string( dimensions_ ) + "."
63  );
64  }
65  }
66  return true;
67 }
68 
69 void EuclideanKmeans::update_centroids(
70  std::vector<Point> const& data,
71  std::vector<size_t> const& assignments,
72  std::vector<Point>& centroids
73 ) {
74  // This function is only called from within the run() function, which already
75  // checks this condition. So, simply assert it here, instead of throwing.
76  assert( data.size() == assignments.size() );
77 
78  auto const k = centroids.size();
79 
80  #ifdef GENESIS_OPENMP
81 
82  // Parallelize over centroids
83  #pragma omp parallel for
84  for( size_t i = 0; i < k; ++i ) {
85 
86  // Thread-local Point
87  auto centroid = Point( dimensions_, 0.0 );
88  size_t count = 0;
89 
90  // Work through the data...
91  for( size_t j = 0; j < data.size(); ++j ) {
92 
93  // ... but only the relevant parts.
94  if( assignments[j] != i ) {
95  continue;
96  }
97 
98  // Accumulate centroid.
99  auto const& datum = data[ j ];
100  for( size_t d = 0; d < dimensions_; ++d ) {
101  centroid[ d ] += datum[ d ];
102  }
103  ++count;
104  }
105 
106  // Build the mean.
107  if( count > 0 ) {
108  for( size_t d = 0; d < dimensions_; ++d ) {
109  centroid[ d ] /= count;
110  }
111  }
112 
113  // Set the centroid. No need for locking, as we only access our i.
114  centroids[ i ] = std::move( centroid );
115  }
116 
117  #else
118 
119  // In the serial case, we only want to traverse the data once.
120 
121  // Init the result as well as counts for calculating the mean.
122  centroids = std::vector<Point>( k, Point( dimensions_, 0.0 ) );
123  auto counts = std::vector<size_t>( k, 0 );
124 
125  // Work through the data and assignments and accumulate.
126  for( size_t i = 0; i < data.size(); ++i ) {
127 
128  // Shorthands.
129  auto const& datum = data[ i ];
130  auto& centroid = centroids[ assignments[i] ];
131 
132  // Accumulate centroid.
133  for( size_t d = 0; d < dimensions_; ++d ) {
134  centroid[ d ] += datum[ d ];
135  }
136 
137  ++counts[ assignments[i] ];
138  }
139 
140  // Build the mean.
141  for( size_t i = 0; i < k; ++i ) {
142  if( counts[ i ] > 0 ) {
143  auto& centroid = centroids[ i ];
144  for( size_t d = 0; d < dimensions_; ++d ) {
145  centroid[ d ] /= counts[ i ];
146  }
147  }
148  }
149 
150  #endif
151 }
152 
153 double EuclideanKmeans::distance( Point const& lhs, Point const& rhs ) const
154 {
155  // Simple euclidean distance
156  double sum = 0.0;
157  for( size_t d = 0; d < dimensions_; ++d ) {
158  auto const diff = lhs[d] - rhs[d];
159  sum += diff * diff;
160  }
161  return std::sqrt( sum );
162 }
163 
164 } // namespace utils
165 } // namespace genesis
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::utils::EuclideanKmeans::EuclideanKmeans
EuclideanKmeans(size_t dimensions)
Definition: euclidean_kmeans.cpp:47
genesis::utils::Kmeans< std::vector< double > >::assignments
std::vector< size_t > const & assignments() const
Definition: utils/math/kmeans.hpp:174
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::EuclideanKmeans::Point
std::vector< double > Point
Definition: euclidean_kmeans.hpp:68
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::Kmeans< std::vector< double > >::centroids
std::vector< std::vector< double > > const & centroids() const
Definition: utils/math/kmeans.hpp:185
euclidean_kmeans.hpp