A toolkit for working with phylogenetic data.
v0.24.0
pca.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace utils {
44 
45 // ================================================================================================
46 // Reduce to Tridiagonal Matrix
47 // ================================================================================================
48 
50 {
51  if( data.empty() ) {
52  throw std::runtime_error(
53  "Cannot calculate reduce_to_tridiagonal_matrix() with an empty matrix."
54  );
55  }
56  if( data.rows() != data.cols() ) {
57  throw std::runtime_error(
58  "Expecting symmetrical matrix for reduce_to_tridiagonal_matrix()"
59  );
60  }
61 
62  // Init result.
64  tri.eigenvalues = std::vector<double>( data.cols() );
65  tri.intermediates = std::vector<double>( data.cols() );
66 
67  for( size_t i = data.cols()-1; i >= 1; --i ) {
68  size_t const l = i - 1;
69  double h = 0.0;
70 
71  if( l > 0 ) {
72  double scale = 0.0;
73 
74  for( size_t k = 0; k <= l; ++k ) {
75  scale += std::fabs(data( i, k ));
76  }
77 
78  if( scale == 0.0 ) {
79  tri.intermediates[i] = data( i, l );
80  } else {
81  for( size_t k = 0; k <= l; ++k ) {
82  data( i, k ) /= scale;
83  h += data( i, k ) * data( i, k );
84  }
85 
86  double f = data( i, l );
87  double g = ( f > 0 ? -sqrt(h) : sqrt(h) );
88  tri.intermediates[i] = scale * g;
89  h -= f * g;
90  data( i, l ) = f - g;
91  f = 0.0;
92 
93  for( size_t j = 0; j <= l; ++j ) {
94  data( j, i ) = data( i, j )/h;
95  g = 0.0;
96 
97  for( size_t k = 0; k <= j; ++k ) {
98  g += data( j, k ) * data( i, k );
99  }
100  for( size_t k = j+1; k <= l; ++k ) {
101  g += data( k, j ) * data( i, k );
102  }
103 
104  tri.intermediates[j] = g / h;
105  f += tri.intermediates[j] * data( i, j );
106  }
107 
108  double hh = f / (h + h);
109  for( size_t j = 0; j <= l; ++j ) {
110  f = data( i, j );
111  tri.intermediates[j] = g = tri.intermediates[j] - hh * f;
112  for( size_t k = 0; k <= j; ++k ) {
113  data( j, k ) -= (f * tri.intermediates[k] + g * data( i, k ));
114  }
115  }
116  }
117  } else {
118  tri.intermediates[i] = data( i, l );
119  }
120 
121  tri.eigenvalues[i] = h;
122  }
123 
124  tri.eigenvalues[0] = 0.0;
125  tri.intermediates[0] = 0.0;
126 
127  for( size_t i = 0; i < data.cols(); ++i ) {
128  if( i > 0 && tri.eigenvalues[i] != 0.0 ) {
129  size_t const l = i - 1;
130  for( size_t j = 0; j <= l; ++j ) {
131  double g = 0.0;
132  for( size_t k = 0; k <= l; ++k ) {
133  g += data( i, k ) * data( k, j );
134  }
135  for( size_t k = 0; k <= l; ++k ) {
136  data( k, j ) -= g * data( k, i );
137  }
138  }
139  }
140 
141  tri.eigenvalues[i] = data( i, i );
142  data( i, i ) = 1.0;
143 
144  if( i > 0 ) {
145  size_t const l = i - 1;
146  for( size_t j = 0; j <= l; ++j ) {
147  data( i, j ) = 0.0;
148  data( j, i ) = 0.0;
149  }
150  }
151  }
152 
153  return tri;
154 }
155 
156 // ================================================================================================
157 // Tridiagonal QL Algorithm
158 // ================================================================================================
159 
161  Matrix<double>& data,
163  size_t max_iterations
164 ) {
165  if( data.empty() ) {
166  throw std::runtime_error(
167  "Cannot calculate tridiagonal_ql_algorithm() with an empty matrix."
168  );
169  }
170  if( data.rows() != data.cols() ) {
171  throw std::runtime_error(
172  "Expecting symmetrical matrix for tridiagonal_ql_algorithm()"
173  );
174  }
175  if( tri.eigenvalues.size() != data.cols() || tri.intermediates.size() != data.cols() ) {
176  throw std::runtime_error(
177  "Expecting TridiagonalDecompositionData vectors of the same size"
178  " as the data matrix in tridiagonal_ql_algorithm()"
179  );
180  }
181 
182  // Some shorthands.
183  auto& d = tri.eigenvalues;
184  auto& e = tri.intermediates;
185  size_t const n = data.rows();
186 
187  for( size_t i = 1; i < n; ++i ) {
188  e[i-1] = e[i];
189  }
190  assert( n > 0 );
191  e[n-1] = 0.0;
192 
193  for( size_t l = 0; l < n; ++l ) {
194  size_t iter = 0;
195  size_t m = 0;
196 
197  do {
198  for( m = l; m < n-1; ++m ) {
199  double const dd = std::fabs( d[m]) + std::fabs(d[m+1] );
200  if( std::fabs(e[m]) + dd == dd ) {
201  break;
202  }
203  }
204 
205  if( m != l ) {
206  if( max_iterations > 0 && iter == max_iterations ) {
207  throw std::runtime_error( "No convergence in tridiagonal_ql_algorithm()." );
208  }
209  ++iter;
210 
211  assert( l < n - 1 );
212  double s = 1.0;
213  double c = 1.0;
214  double p = 0.0;
215  double g = (d[l+1] - d[l]) / (2.0 * e[l]);
216  double r = sqrt((g * g) + 1.0);
217  auto sign = ( (g) < 0 ? -std::fabs(r) : std::fabs(r) );
218  g = d[m] - d[l] + e[l] / (g + sign);
219 
220  for( long i = static_cast<long>( m ) - 1; i >= static_cast<long>( l ); --i ) {
221  double f = s * e[i];
222  double b = c * e[i];
223 
224  if( std::fabs(f) >= std::fabs(g) ) {
225  c = g / f;
226  r = sqrt((c * c) + 1.0);
227  e[i+1] = f * r;
228  c *= (s = 1.0/r);
229  } else {
230  s = f / g;
231  r = sqrt((s * s) + 1.0);
232  e[i+1] = g * r;
233  s *= (c = 1.0/r);
234  }
235 
236  g = d[i+1] - p;
237  r = (d[i] - g) * s + 2.0 * c * b;
238  p = s * r;
239  d[i+1] = g + p;
240  g = c * r - b;
241  for( size_t k = 0; k < n; ++k ) {
242  f = data( k, i+1 );
243  data( k, i+1 ) = s * data( k, i ) + c * f;
244  data( k, i ) = c * data( k, i ) - s * f;
245  }
246  }
247 
248  d[l] = d[l] - p;
249  e[l] = g;
250  e[m] = 0.0;
251  }
252  } while (m != l);
253  }
254 }
255 
256 // ================================================================================================
257 // Principal Component Analysis
258 // ================================================================================================
259 
261  Matrix<double> const& data,
262  size_t components,
263  PcaStandardization standardization
264 ) {
265  // Prepare result struct.
266  PcaData result;
267 
268  // Normalize data and get correlation/covariance matrix.
269  // We manually run the normalization step here, because we need the normalized data later.
270  // The resulting `symmat` is then then same as if after performing
271  // correlation_matrix / covariance_matrix / sums_of_squares_and_cross_products_matrix
272  // (depending on the settings for the standardization).
273  // The `symmat` is later overwritten by the tridiagonal decomposition algorithm and then
274  // contains the eigenvectors.
275  auto standardized_data = data;
276  if( standardization == PcaStandardization::kCorrelation ) {
277  standardize_cols( standardized_data, true, true );
278  } else if( standardization == PcaStandardization::kCovariance ) {
279  standardize_cols( standardized_data, true, false );
280  }
281  auto symmat = sums_of_squares_and_cross_products_matrix( standardized_data );
282  for( auto& elem : symmat ) {
283  elem /= static_cast<double>( standardized_data.rows() );
284  }
285 
286  // As symmaat is later overwritten, we store the correlation/covariance for later.
287  // auto covar = symmat;
288 
289  // Get number of desired PCA components.
290  if( components == 0 ) {
291  components = standardized_data.cols();
292  }
293  if( components > standardized_data.cols() ) {
294  throw std::runtime_error(
295  "Cannot calculate more PCA components than the original data has columns."
296  );
297  }
298 
299  // Eigenvalue Decompostion.
300  auto tri = reduce_to_tridiagonal_matrix( symmat );
301  tridiagonal_ql_algorithm( symmat, tri );
302 
303  // Some checks.
304  assert( tri.eigenvalues.size() == standardized_data.cols() );
305  assert( tri.intermediates.size() == standardized_data.cols() );
306  assert( symmat.rows() == standardized_data.cols() );
307  assert( symmat.cols() == standardized_data.cols() );
308 
309  // Sort Eigenvalues and Eigenvectors and store in result struct.
310  auto sorted_indices = sort_indices(
311  tri.eigenvalues.begin(),
312  tri.eigenvalues.end(),
313  std::greater<double>()
314  );
315  result.eigenvalues = std::vector<double>( components );
316  result.eigenvectors = Matrix<double>( symmat.rows(), components );
317  for( size_t c = 0; c < components; ++c ) {
318  result.eigenvalues[c] = tri.eigenvalues[ sorted_indices[c] ];
319 
320  for( size_t r = 0; r < symmat.rows(); ++r ) {
321  result.eigenvectors( r, c ) = symmat( r, sorted_indices[c] );
322  }
323  }
324 
325  // Store projections of row-points on pricipal components into result.
326  // This is a simple matrix multiplication of the normalized data with the eigenvectors.
327  result.projection = matrix_multiplication( standardized_data, result.eigenvectors );
328 
329  // Experimental: Calculate column projection.
330  // See original implememtation for code:
331  // https://github.com/davidhalter-archive/ardour/blob/master/libs/qm-dsp/maths/pca/pca.c
332  //
333  // auto col_proj = Matrix<double>( data.cols(), data.cols() );
334  // /* Form projections of col.-points on first three prin. components. */
335  // /* Store in 'covar', overwriting what was stored in this. */
336  // for( size_t j = 0; j < data.cols(); ++j ) {
337  // for( size_t i = 0; i < 3; ++i ) {
338  // col_proj(j,i) = 0.0;
339  // for( size_t k2 = 0; k2 < data.cols(); ++k2 ) {
340  // col_proj(j,i) += covar(j,k2) * result.eigenvectors( k2, data.cols()-i-1 );
341  // }
342  // if( result.eigenvalues[data.cols()-i-1] > 0.0005 ) {
343  // /* Guard against zero eigenvalue */
344  // col_proj(j,i) /= sqrt(result.eigenvalues[data.cols()-i-1]); /* Rescale */
345  // } else {
346  // col_proj(j,i) = 0.0; /* Standard kludge */
347  // }
348  // }
349  // }
350  //
351  // printf("\nProjections of column-points on first 3 prin. comps.:\n");
352  // for (size_t j = 0; j < data.cols(); j++) {
353  // for ( size_t k = 0; k < 3; k++) {
354  // printf("%12.4f", col_proj(j,k));
355  // }
356  // printf("\n");
357  // }
358 
359  return result;
360 }
361 
362 } // namespace utils
363 } // namespace genesis
Matrix< T > matrix_multiplication(Matrix< A > const &a, Matrix< B > const &b)
Calculate the product of two Matrices.
Provides some valuable algorithms that are not part of the C++ 11 STL.
std::vector< double > eigenvalues
Definition: pca.hpp:97
Matrix< double > sums_of_squares_and_cross_products_matrix(Matrix< double > const &data)
Calculate the Sums of Squares and Cross Products Matrix (SSCP Matrix).
std::vector< double > intermediates
Definition: pca.hpp:89
Standardize the mean, but not the variance of the data before performing the PCA. ...
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
Helper stucture that collects the output of principal_component_analysis().
Definition: pca.hpp:95
std::vector< size_t > sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the sorted order of the given range.
Definition: algorithm.hpp:182
Matrix operators.
PcaData principal_component_analysis(Matrix< double > const &data, size_t components, PcaStandardization standardization)
Perfom a Principal Component Analysis on a given data Matrix.
Definition: pca.cpp:260
Matrix< double > eigenvectors
Definition: pca.hpp:98
TridiagonalDecompositionData reduce_to_tridiagonal_matrix(Matrix< double > &data)
Triangular decomposition of a symmetric matrix.
Definition: pca.cpp:49
void scale(Histogram &h, double factor)
Definition: operations.cpp:54
Matrix< double > projection
Definition: pca.hpp:99
PcaStandardization
Setting for principal_component_analysis() to determine which form of standardization of the data to ...
Definition: pca.hpp:49
void tridiagonal_ql_algorithm(Matrix< double > &data, TridiagonalDecompositionData &tri, size_t max_iterations)
Reduce a symmetric matrix to a symmetric tridiagonal matrix.
Definition: pca.cpp:160
std::vector< double > eigenvalues
Definition: pca.hpp:88
Standardize the mean and variance of the data before performing the PCA.
Helper structure used for the eigenvalue decomposition in reduce_to_tridiagonal_matrix() and tridiago...
Definition: pca.hpp:86
std::vector< MeanStddevPair > standardize_cols(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the columns of a Matrix by subtracting the mean and scaling to unit variance.
Definition: math/matrix.cpp:94