A library for working with phylogenetic and population genetic data.
v0.27.0
pca.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace utils {
44 
45 // ================================================================================================
46 // Reduce to Tridiagonal Matrix
47 // ================================================================================================
48 
50 {
51  if( data.empty() ) {
52  throw std::runtime_error(
53  "Cannot calculate reduce_to_tridiagonal_matrix() with an empty matrix."
54  );
55  }
56  if( data.rows() != data.cols() ) {
57  throw std::runtime_error(
58  "Expecting symmetrical matrix for reduce_to_tridiagonal_matrix()"
59  );
60  }
61 
62  // Init result.
64  tri.eigenvalues = std::vector<double>( data.cols() );
65  tri.intermediates = std::vector<double>( data.cols() );
66 
67  for( size_t i = data.cols()-1; i >= 1; --i ) {
68  size_t const l = i - 1;
69  double h = 0.0;
70 
71  if( l > 0 ) {
72  double scale = 0.0;
73 
74  for( size_t k = 0; k <= l; ++k ) {
75  scale += std::fabs(data( i, k ));
76  }
77 
78  if( scale == 0.0 ) {
79  tri.intermediates[i] = data( i, l );
80  } else {
81  for( size_t k = 0; k <= l; ++k ) {
82  data( i, k ) /= scale;
83  h += data( i, k ) * data( i, k );
84  }
85 
86  double f = data( i, l );
87  double g = ( f > 0 ? -sqrt(h) : sqrt(h) );
88  tri.intermediates[i] = scale * g;
89  h -= f * g;
90  data( i, l ) = f - g;
91  f = 0.0;
92 
93  for( size_t j = 0; j <= l; ++j ) {
94  data( j, i ) = data( i, j )/h;
95  g = 0.0;
96 
97  for( size_t k = 0; k <= j; ++k ) {
98  g += data( j, k ) * data( i, k );
99  }
100  for( size_t k = j+1; k <= l; ++k ) {
101  g += data( k, j ) * data( i, k );
102  }
103 
104  tri.intermediates[j] = g / h;
105  f += tri.intermediates[j] * data( i, j );
106  }
107 
108  double hh = f / (h + h);
109  for( size_t j = 0; j <= l; ++j ) {
110  f = data( i, j );
111  tri.intermediates[j] = g = tri.intermediates[j] - hh * f;
112  for( size_t k = 0; k <= j; ++k ) {
113  data( j, k ) -= (f * tri.intermediates[k] + g * data( i, k ));
114  }
115  }
116  }
117  } else {
118  tri.intermediates[i] = data( i, l );
119  }
120 
121  tri.eigenvalues[i] = h;
122  }
123 
124  tri.eigenvalues[0] = 0.0;
125  tri.intermediates[0] = 0.0;
126 
127  for( size_t i = 0; i < data.cols(); ++i ) {
128  if( i > 0 && tri.eigenvalues[i] != 0.0 ) {
129  size_t const l = i - 1;
130  for( size_t j = 0; j <= l; ++j ) {
131  double g = 0.0;
132  for( size_t k = 0; k <= l; ++k ) {
133  g += data( i, k ) * data( k, j );
134  }
135  for( size_t k = 0; k <= l; ++k ) {
136  data( k, j ) -= g * data( k, i );
137  }
138  }
139  }
140 
141  tri.eigenvalues[i] = data( i, i );
142  data( i, i ) = 1.0;
143 
144  if( i > 0 ) {
145  size_t const l = i - 1;
146  for( size_t j = 0; j <= l; ++j ) {
147  data( i, j ) = 0.0;
148  data( j, i ) = 0.0;
149  }
150  }
151  }
152 
153  return tri;
154 }
155 
156 // ================================================================================================
157 // Tridiagonal QL Algorithm
158 // ================================================================================================
159 
161  Matrix<double>& data,
163  size_t max_iterations
164 ) {
165  if( data.empty() ) {
166  throw std::runtime_error(
167  "Cannot calculate tridiagonal_ql_algorithm() with an empty matrix."
168  );
169  }
170  if( data.rows() != data.cols() ) {
171  throw std::runtime_error(
172  "Expecting symmetrical matrix for tridiagonal_ql_algorithm()"
173  );
174  }
175  if( tri.eigenvalues.size() != data.cols() || tri.intermediates.size() != data.cols() ) {
176  throw std::runtime_error(
177  "Expecting TridiagonalDecompositionData vectors of the same size"
178  " as the data matrix in tridiagonal_ql_algorithm()"
179  );
180  }
181 
182  // Some shorthands.
183  auto& d = tri.eigenvalues;
184  auto& e = tri.intermediates;
185  size_t const n = data.rows();
186 
187  for( size_t i = 1; i < n; ++i ) {
188  e[i-1] = e[i];
189  }
190  assert( n > 0 );
191  e[n-1] = 0.0;
192 
193  for( size_t l = 0; l < n; ++l ) {
194  size_t iter = 0;
195  size_t m = 0;
196 
197  do {
198  for( m = l; m < n-1; ++m ) {
199  double const dd = std::fabs( d[m]) + std::fabs(d[m+1] );
200  if( std::fabs(e[m]) + dd == dd ) {
201  break;
202  }
203  }
204 
205  if( m != l ) {
206  if( max_iterations > 0 && iter == max_iterations ) {
207  throw std::runtime_error( "No convergence in tridiagonal_ql_algorithm()." );
208  }
209  ++iter;
210 
211  assert( l < n - 1 );
212  double s = 1.0;
213  double c = 1.0;
214  double p = 0.0;
215  double g = (d[l+1] - d[l]) / (2.0 * e[l]);
216  double r = sqrt((g * g) + 1.0);
217  auto sign = ( (g) < 0 ? -std::fabs(r) : std::fabs(r) );
218  g = d[m] - d[l] + e[l] / (g + sign);
219 
220  for( long i = static_cast<long>( m ) - 1; i >= static_cast<long>( l ); --i ) {
221  double f = s * e[i];
222  double b = c * e[i];
223 
224  if( std::fabs(f) >= std::fabs(g) ) {
225  c = g / f;
226  r = sqrt((c * c) + 1.0);
227  e[i+1] = f * r;
228  c *= (s = 1.0/r);
229  } else {
230  s = f / g;
231  r = sqrt((s * s) + 1.0);
232  e[i+1] = g * r;
233  s *= (c = 1.0/r);
234  }
235 
236  g = d[i+1] - p;
237  r = (d[i] - g) * s + 2.0 * c * b;
238  p = s * r;
239  d[i+1] = g + p;
240  g = c * r - b;
241  for( size_t k = 0; k < n; ++k ) {
242  f = data( k, i+1 );
243  data( k, i+1 ) = s * data( k, i ) + c * f;
244  data( k, i ) = c * data( k, i ) - s * f;
245  }
246  }
247 
248  d[l] = d[l] - p;
249  e[l] = g;
250  e[m] = 0.0;
251  }
252  } while (m != l);
253  }
254 }
255 
256 // ================================================================================================
257 // Principal Component Analysis
258 // ================================================================================================
259 
261  Matrix<double> const& data,
262  size_t components,
263  PcaStandardization standardization
264 ) {
265  // Prepare result struct.
266  PcaData result;
267 
268  // Normalize data and get correlation/covariance matrix.
269  // We manually run the normalization step here, because we need the normalized data later.
270  // The resulting `symmat` is then then same as if after performing
271  // correlation_matrix / covariance_matrix / sums_of_squares_and_cross_products_matrix
272  // (depending on the settings for the standardization).
273  // The `symmat` is later overwritten by the tridiagonal decomposition algorithm and then
274  // contains the eigenvectors.
275  auto standardized_data = data;
276  if( standardization == PcaStandardization::kCorrelation ) {
277  standardize_cols( standardized_data, true, true );
278  } else if( standardization == PcaStandardization::kCovariance ) {
279  standardize_cols( standardized_data, true, false );
280  }
281  auto symmat = sums_of_squares_and_cross_products_matrix( standardized_data );
282  for( auto& elem : symmat ) {
283  elem /= static_cast<double>( standardized_data.rows() );
284  }
285 
286  // As symmaat is later overwritten, we store the correlation/covariance for later.
287  // auto covar = symmat;
288 
289  // Get number of desired PCA components.
290  if( components == 0 ) {
291  components = standardized_data.cols();
292  }
293  if( components > standardized_data.cols() ) {
294  throw std::runtime_error(
295  "Cannot calculate more PCA components than the original data has columns."
296  );
297  }
298 
299  // Eigenvalue Decompostion.
300  auto tri = reduce_to_tridiagonal_matrix( symmat );
301  tridiagonal_ql_algorithm( symmat, tri );
302 
303  // Some checks.
304  assert( tri.eigenvalues.size() == standardized_data.cols() );
305  assert( tri.intermediates.size() == standardized_data.cols() );
306  assert( symmat.rows() == standardized_data.cols() );
307  assert( symmat.cols() == standardized_data.cols() );
308 
309  // Sort Eigenvalues and Eigenvectors and store in result struct.
310  auto sorted_indices = sort_indices(
311  tri.eigenvalues.begin(),
312  tri.eigenvalues.end(),
313  std::greater<double>()
314  );
315  result.eigenvalues = std::vector<double>( components );
316  result.eigenvectors = Matrix<double>( symmat.rows(), components );
317  for( size_t c = 0; c < components; ++c ) {
318  result.eigenvalues[c] = tri.eigenvalues[ sorted_indices[c] ];
319 
320  for( size_t r = 0; r < symmat.rows(); ++r ) {
321  result.eigenvectors( r, c ) = symmat( r, sorted_indices[c] );
322  }
323  }
324 
325  // Store projections of row-points on pricipal components into result.
326  // This is a simple matrix multiplication of the normalized data with the eigenvectors.
327  result.projection = matrix_multiplication( standardized_data, result.eigenvectors );
328 
329  // Experimental: Calculate column projection.
330  // See original implememtation for code:
331  // https://github.com/davidhalter-archive/ardour/blob/master/libs/qm-dsp/maths/pca/pca.c
332  //
333  // auto col_proj = Matrix<double>( data.cols(), data.cols() );
334  // /* Form projections of col.-points on first three prin. components. */
335  // /* Store in 'covar', overwriting what was stored in this. */
336  // for( size_t j = 0; j < data.cols(); ++j ) {
337  // for( size_t i = 0; i < 3; ++i ) {
338  // col_proj(j,i) = 0.0;
339  // for( size_t k2 = 0; k2 < data.cols(); ++k2 ) {
340  // col_proj(j,i) += covar(j,k2) * result.eigenvectors( k2, data.cols()-i-1 );
341  // }
342  // if( result.eigenvalues[data.cols()-i-1] > 0.0005 ) {
343  // /* Guard against zero eigenvalue */
344  // col_proj(j,i) /= sqrt(result.eigenvalues[data.cols()-i-1]); /* Rescale */
345  // } else {
346  // col_proj(j,i) = 0.0; /* Standard kludge */
347  // }
348  // }
349  // }
350  //
351  // printf("\nProjections of column-points on first 3 prin. comps.:\n");
352  // for (size_t j = 0; j < data.cols(); j++) {
353  // for ( size_t k = 0; k < 3; k++) {
354  // printf("%12.4f", col_proj(j,k));
355  // }
356  // printf("\n");
357  // }
358 
359  return result;
360 }
361 
362 } // namespace utils
363 } // namespace genesis
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::utils::Matrix::empty
bool empty() const
Definition: containers/matrix.hpp:177
genesis::utils::PcaData::eigenvalues
std::vector< double > eigenvalues
Definition: pca.hpp:97
genesis::utils::Matrix::cols
size_t cols() const
Definition: containers/matrix.hpp:167
genesis::utils::TridiagonalDecompositionData
Helper structure used for the eigenvalue decomposition in reduce_to_tridiagonal_matrix() and tridiago...
Definition: pca.hpp:86
genesis::utils::standardize_cols
std::vector< MeanStddevPair > standardize_cols(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the columns of a Matrix by subtracting the mean and scaling to unit variance.
Definition: math/matrix.cpp:94
pca.hpp
genesis::utils::TridiagonalDecompositionData::eigenvalues
std::vector< double > eigenvalues
Definition: pca.hpp:88
genesis::utils::sort_indices
std::vector< size_t > sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the sorted order of the given range.
Definition: algorithm.hpp:182
genesis::utils::scale
void scale(Histogram &h, double factor)
Definition: operations.cpp:54
genesis::utils::Matrix< double >
genesis::utils::PcaStandardization::kCovariance
@ kCovariance
Standardize the mean, but not the variance of the data before performing the PCA.
matrix.hpp
genesis::utils::sums_of_squares_and_cross_products_matrix
Matrix< double > sums_of_squares_and_cross_products_matrix(Matrix< double > const &data)
Calculate the Sums of Squares and Cross Products Matrix (SSCP Matrix).
Definition: math/matrix.cpp:225
genesis::utils::PcaData::projection
Matrix< double > projection
Definition: pca.hpp:99
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::principal_component_analysis
PcaData principal_component_analysis(Matrix< double > const &data, size_t components, PcaStandardization standardization)
Perfom a Principal Component Analysis on a given data Matrix.
Definition: pca.cpp:260
genesis::utils::TridiagonalDecompositionData::intermediates
std::vector< double > intermediates
Definition: pca.hpp:89
genesis::utils::PcaData::eigenvectors
Matrix< double > eigenvectors
Definition: pca.hpp:98
genesis::utils::reduce_to_tridiagonal_matrix
TridiagonalDecompositionData reduce_to_tridiagonal_matrix(Matrix< double > &data)
Triangular decomposition of a symmetric matrix.
Definition: pca.cpp:49
genesis::utils::PcaData
Helper stucture that collects the output of principal_component_analysis().
Definition: pca.hpp:95
operators.hpp
Matrix operators.
genesis::utils::matrix_multiplication
Matrix< T > matrix_multiplication(Matrix< A > const &a, Matrix< B > const &b)
Calculate the product of two Matrices.
Definition: math/matrix.hpp:548
genesis::utils::PcaStandardization
PcaStandardization
Setting for principal_component_analysis() to determine which form of standardization of the data to ...
Definition: pca.hpp:49
genesis::utils::Matrix::rows
size_t rows() const
Definition: containers/matrix.hpp:162
genesis::utils::PcaStandardization::kCorrelation
@ kCorrelation
Standardize the mean and variance of the data before performing the PCA.
genesis::utils::tridiagonal_ql_algorithm
void tridiagonal_ql_algorithm(Matrix< double > &data, TridiagonalDecompositionData &tri, size_t max_iterations)
Reduce a symmetric matrix to a symmetric tridiagonal matrix.
Definition: pca.cpp:160