A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
pca.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace utils {
44 
45 // ================================================================================================
46 // Reduce to Tridiagonal Matrix
47 // ================================================================================================
48 
50 {
51  if( data.rows() != data.cols() ) {
52  throw std::runtime_error(
53  "Expecting symmetrical matrix for reduce_to_tridiagonal_matrix()"
54  );
55  }
56 
57  // Init result.
59  tri.eigenvalues = std::vector<double>( data.cols() );
60  tri.intermediates = std::vector<double>( data.cols() );
61 
62  for( size_t i = data.cols()-1; i >= 1; --i ) {
63  size_t l = i - 1;
64  double h = 0.0;
65  double scale = 0.0;
66 
67  if( l > 0 ) {
68  for( size_t k = 0; k <= l; ++k ) {
69  scale += fabs(data( i, k ));
70  }
71 
72  if( scale == 0.0 ) {
73  tri.intermediates[i] = data( i, l );
74  } else {
75  for( size_t k = 0; k <= l; ++k ) {
76  data( i, k ) /= scale;
77  h += data( i, k ) * data( i, k );
78  }
79 
80  double f = data( i, l );
81  double g = ( f > 0 ? -sqrt(h) : sqrt(h) );
82  tri.intermediates[i] = scale * g;
83  h -= f * g;
84  data( i, l ) = f - g;
85  f = 0.0;
86 
87  for( size_t j = 0; j <= l; ++j ) {
88  data( j, i ) = data( i, j )/h;
89  g = 0.0;
90 
91  for( size_t k = 0; k <= j; ++k ) {
92  g += data( j, k ) * data( i, k );
93  }
94  for( size_t k = j+1; k <= l; ++k ) {
95  g += data( k, j ) * data( i, k );
96  }
97 
98  tri.intermediates[j] = g / h;
99  f += tri.intermediates[j] * data( i, j );
100  }
101 
102  double hh = f / (h + h);
103  for( size_t j = 0; j <= l; ++j ) {
104  f = data( i, j );
105  tri.intermediates[j] = g = tri.intermediates[j] - hh * f;
106  for( size_t k = 0; k <= j; ++k ) {
107  data( j, k ) -= (f * tri.intermediates[k] + g * data( i, k ));
108  }
109  }
110  }
111  } else {
112  tri.intermediates[i] = data( i, l );
113  }
114 
115  tri.eigenvalues[i] = h;
116  }
117 
118  tri.eigenvalues[0] = 0.0;
119  tri.intermediates[0] = 0.0;
120 
121  for( size_t i = 0; i < data.cols(); ++i ) {
122  int l = i - 1;
123  if( tri.eigenvalues[i] ) {
124  for( int j = 0; j <= l; ++j ) {
125  double g = 0.0;
126  for( int k = 0; k <= l; ++k ) {
127  g += data( i, k ) * data( k, j );
128  }
129  for( int k = 0; k <= l; ++k ) {
130  data( k, j ) -= g * data( k, i );
131  }
132  }
133  }
134 
135  tri.eigenvalues[i] = data( i, i );
136  data( i, i ) = 1.0;
137  for( int j = 0; j <= l; ++j ) {
138  data( i, j ) = 0.0;
139  data( j, i ) = 0.0;
140  }
141  }
142 
143  return tri;
144 }
145 
146 // ================================================================================================
147 // Tridiagonal QL Algorithm
148 // ================================================================================================
149 
151  Matrix<double>& data,
153  size_t max_iterations
154 ) {
155  if( data.rows() != data.cols() ) {
156  throw std::runtime_error(
157  "Expecting symmetrical matrix for tridiagonal_ql_algorithm()"
158  );
159  }
160  if( tri.eigenvalues.size() != data.cols() || tri.intermediates.size() != data.cols() ) {
161  throw std::runtime_error(
162  "Expecting TridiagonalDecompositionData vectors of the same size"
163  " as the data matrix in tridiagonal_ql_algorithm()"
164  );
165  }
166 
167  // Some shorthands.
168  auto& d = tri.eigenvalues;
169  auto& e = tri.intermediates;
170  int n = static_cast<int>( data.rows() );
171 
172  for( int i = 1; i < n; ++i ) {
173  e[i-1] = e[i];
174  }
175  e[n-1] = 0.0;
176 
177  for( int l = 0; l < n; ++l ) {
178  size_t iter = 0;
179  int m = 0;
180 
181  do {
182  for( m = l; m < n-1; ++m ) {
183  double dd = fabs(d[m]) + fabs(d[m+1]);
184  if (fabs(e[m]) + dd == dd) {
185  break;
186  }
187  }
188 
189  if( m != l ) {
190  if( max_iterations > 0 && iter++ == max_iterations ) {
191  throw std::runtime_error( "No convergence in tridiagonal_ql_algorithm()." );
192  }
193 
194  double s = 1.0;
195  double c = 1.0;
196  double p = 0.0;
197  double g = (d[l+1] - d[l]) / (2.0 * e[l]);
198  double r = sqrt((g * g) + 1.0);
199  auto sign = ( (g) < 0 ? -fabs(r) : fabs(r) );
200  g = d[m] - d[l] + e[l] / (g + sign);
201 
202  for( int i = m-1; i >= l; --i ) {
203  double f = s * e[i];
204  double b = c * e[i];
205 
206  if (fabs(f) >= fabs(g)) {
207  c = g / f;
208  r = sqrt((c * c) + 1.0);
209  e[i+1] = f * r;
210  c *= (s = 1.0/r);
211  } else {
212  s = f / g;
213  r = sqrt((s * s) + 1.0);
214  e[i+1] = g * r;
215  s *= (c = 1.0/r);
216  }
217 
218  g = d[i+1] - p;
219  r = (d[i] - g) * s + 2.0 * c * b;
220  p = s * r;
221  d[i+1] = g + p;
222  g = c * r - b;
223  for( int k = 0; k < n; ++k ) {
224  f = data( k, i+1 );
225  data( k, i+1 ) = s * data( k, i ) + c * f;
226  data( k, i ) = c * data( k, i ) - s * f;
227  }
228  }
229 
230  d[l] = d[l] - p;
231  e[l] = g;
232  e[m] = 0.0;
233  }
234  } while (m != l);
235  }
236 }
237 
238 // ================================================================================================
239 // Principal Component Analysis
240 // ================================================================================================
241 
243  Matrix<double> const& data,
244  size_t components,
245  PcaStandardization standardization
246 ) {
247  // Prepare result struct.
248  PcaData result;
249 
250  // Normalize data and get correlation/covariance matrix.
251  // We manually run the normalization step here, because we need the normalized data later.
252  // The resulting `symmat` is then then same as if after performing
253  // correlation_matrix / covariance_matrix / sums_of_squares_and_cross_products_matrix
254  // (depending on the settings for the standardization).
255  // The `symmat` is later overwritten by the tridiagonal decomposition algorithm and then
256  // contains the eigenvectors.
257  auto standardized_data = data;
258  if( standardization == PcaStandardization::kCorrelation ) {
259  standardize_cols( standardized_data, true, true );
260  } else if( standardization == PcaStandardization::kCovariance ) {
261  standardize_cols( standardized_data, true, false );
262  }
263  auto symmat = sums_of_squares_and_cross_products_matrix( standardized_data );
264  for( auto& elem : symmat ) {
265  elem /= static_cast<double>( standardized_data.rows() );
266  }
267 
268  // As symmaat is later overwritten, we store the correlation/covariance for later.
269  // auto covar = symmat;
270 
271  // Get number of desired PCA components.
272  if( components == 0 ) {
273  components = standardized_data.cols();
274  }
275  if( components > standardized_data.cols() ) {
276  throw std::runtime_error(
277  "Cannot calculate more PCA components than the original data has columns."
278  );
279  }
280 
281  // Eigenvalue Decompostion.
282  auto tri = reduce_to_tridiagonal_matrix( symmat );
283  tridiagonal_ql_algorithm( symmat, tri );
284 
285  // Some checks.
286  assert( tri.eigenvalues.size() == standardized_data.cols() );
287  assert( tri.intermediates.size() == standardized_data.cols() );
288  assert( symmat.rows() == standardized_data.cols() );
289  assert( symmat.cols() == standardized_data.cols() );
290 
291  // Sort Eigenvalues and Eigenvectors and store in result struct.
292  auto sorted_indices = sort_indices(
293  tri.eigenvalues.begin(),
294  tri.eigenvalues.end(),
295  std::greater<double>()
296  );
297  result.eigenvalues = std::vector<double>( components );
298  result.eigenvectors = Matrix<double>( symmat.rows(), components );
299  for( size_t c = 0; c < components; ++c ) {
300  result.eigenvalues[c] = tri.eigenvalues[ sorted_indices[c] ];
301 
302  for( size_t r = 0; r < symmat.rows(); ++r ) {
303  result.eigenvectors( r, c ) = symmat( r, sorted_indices[c] );
304  }
305  }
306 
307  // Store projections of row-points on pricipal components into result.
308  // This is a simple matrix multiplication of the normalized data with the eigenvectors.
309  result.projection = matrix_multiplication( standardized_data, result.eigenvectors );
310 
311  // Experimental: Calculate column projection.
312  // See original implememtation for code:
313  // https://github.com/davidhalter-archive/ardour/blob/master/libs/qm-dsp/maths/pca/pca.c
314  //
315  // auto col_proj = Matrix<double>( data.cols(), data.cols() );
316  // /* Form projections of col.-points on first three prin. components. */
317  // /* Store in 'covar', overwriting what was stored in this. */
318  // for( size_t j = 0; j < data.cols(); ++j ) {
319  // for( size_t i = 0; i < 3; ++i ) {
320  // col_proj(j,i) = 0.0;
321  // for( size_t k2 = 0; k2 < data.cols(); ++k2 ) {
322  // col_proj(j,i) += covar(j,k2) * result.eigenvectors( k2, data.cols()-i-1 );
323  // }
324  // if( result.eigenvalues[data.cols()-i-1] > 0.0005 ) {
325  // /* Guard against zero eigenvalue */
326  // col_proj(j,i) /= sqrt(result.eigenvalues[data.cols()-i-1]); /* Rescale */
327  // } else {
328  // col_proj(j,i) = 0.0; /* Standard kludge */
329  // }
330  // }
331  // }
332  //
333  // printf("\nProjections of column-points on first 3 prin. comps.:\n");
334  // for (size_t j = 0; j < data.cols(); j++) {
335  // for ( size_t k = 0; k < 3; k++) {
336  // printf("%12.4f", col_proj(j,k));
337  // }
338  // printf("\n");
339  // }
340 
341  return result;
342 }
343 
344 } // namespace utils
345 } // namespace genesis
Matrix< T > matrix_multiplication(Matrix< A > const &a, Matrix< B > const &b)
Calculate the product of two Matrices.
Provides some valuable algorithms that are not part of the C++ 11 STL.
std::vector< double > eigenvalues
Definition: pca.hpp:97
std::vector< double > intermediates
Definition: pca.hpp:89
Standardize the mean, but not the variance of the data before performing the PCA. ...
Helper stucture that collects the output of principal_component_analysis().
Definition: pca.hpp:95
std::vector< size_t > sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the sorted order of the given range.
Definition: algorithm.hpp:124
Matrix operators.
Matrix< double > eigenvectors
Definition: pca.hpp:98
TridiagonalDecompositionData reduce_to_tridiagonal_matrix(Matrix< double > &data)
Triangular decomposition of a symmetric matrix.
Definition: pca.cpp:49
void scale(Histogram &h, double factor)
Definition: operations.cpp:54
Matrix< double > projection
Definition: pca.hpp:99
PcaStandardization
Setting for principal_component_analysis() to determine which form of standardization of the data to ...
Definition: pca.hpp:49
void tridiagonal_ql_algorithm(Matrix< double > &data, TridiagonalDecompositionData &tri, size_t max_iterations)
Reduce a symmetric matrix to a symmetric tridiagonal matrix.
Definition: pca.cpp:150
std::vector< double > eigenvalues
Definition: pca.hpp:88
Standardize the mean and variance of the data before performing the PCA.
PcaData principal_component_analysis(Matrix< double > const &data, size_t components, PcaStandardization standardization)
Perfom a Principal Component Analysis on a given data Matrix.
Definition: pca.cpp:242
Matrix< double > sums_of_squares_and_cross_products_matrix(Matrix< double > const &data)
Calculate the Sums of Squares and Cross Products Matrix (SSCP Matrix).
Definition: matrix.cpp:345
Helper structure used for the eigenvalue decomposition in reduce_to_tridiagonal_matrix() and tridiago...
Definition: pca.hpp:86
std::vector< MeanStddevPair > standardize_cols(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the columns of a Matrix by subtracting the mean and scaling to unit variance.
Definition: matrix.cpp:93