A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <algorithm>
36 #include <cassert>
37 #include <cmath>
38 #include <stdexcept>
39 
40 #ifdef GENESIS_OPENMP
41 # include <omp.h>
42 #endif
43 
44 namespace genesis {
45 namespace utils {
46 
47 // ================================================================================================
48 // Normalize
49 // ================================================================================================
50 
51 std::vector<MinMaxPair<double>> normalize_cols( Matrix<double>& data )
52 {
53  auto col_minmax = matrix_col_minmax( data );
54  assert( col_minmax.size() == data.cols() );
55 
56  // Iterate the matrix.
57  #pragma omp parallel for
58  for( size_t r = 0; r < data.rows(); ++r ) {
59  for( size_t c = 0; c < data.cols(); ++c ) {
60  // Adjust column values.
61  assert( col_minmax[c].max >= col_minmax[c].min );
62  double diff = col_minmax[c].max - col_minmax[c].min;
63  data( r, c ) = ( data( r, c ) - col_minmax[c].min ) / diff;
64  }
65  }
66 
67  return col_minmax;
68 }
69 
70 std::vector<MinMaxPair<double>> normalize_rows( Matrix<double>& data )
71 {
72  auto row_minmax = matrix_row_minmax( data );
73  assert( row_minmax.size() == data.rows() );
74 
75  // Iterate the matrix.
76  #pragma omp parallel for
77  for( size_t r = 0; r < data.rows(); ++r ) {
78  for( size_t c = 0; c < data.cols(); ++c ) {
79  // Adjust row values.
80  assert( row_minmax[r].max >= row_minmax[r].min );
81  double diff = row_minmax[r].max - row_minmax[r].min;
82  data( r, c ) = ( data( r, c ) - row_minmax[r].min ) / diff;
83  }
84  }
85 
86  return row_minmax;
87 }
88 
89 // ================================================================================================
90 // Standardize
91 // ================================================================================================
92 
93 std::vector<MeanStddevPair> standardize_cols(
94  Matrix<double>& data,
95  bool scale_means,
96  bool scale_std
97 ) {
98  auto col_mean_stddev = matrix_col_mean_stddev( data, 0.0000001 );
99  assert( col_mean_stddev.size() == data.cols() );
100 
101  // Iterate the matrix.
102  #pragma omp parallel for
103  for( size_t r = 0; r < data.rows(); ++r ) {
104  for( size_t c = 0; c < data.cols(); ++c ) {
105 
106  // Subtract mean (i.e., center data).
107  if( scale_means ) {
108  data( r, c ) -= col_mean_stddev[c].mean;
109  }
110 
111  // Scale to unit variance, if needed.
112  if( scale_std ) {
113  assert( col_mean_stddev[c].stddev > 0.0 );
114  data( r, c ) /= col_mean_stddev[c].stddev;
115  }
116  }
117  }
118 
119  return col_mean_stddev;
120 }
121 
122 std::vector<MeanStddevPair> standardize_rows(
123  Matrix<double>& data,
124  bool scale_means,
125  bool scale_std
126 ) {
127  auto row_mean_stddev = matrix_row_mean_stddev( data, 0.0000001 );
128  assert( row_mean_stddev.size() == data.rows() );
129 
130  // Iterate the matrix.
131  #pragma omp parallel for
132  for( size_t r = 0; r < data.rows(); ++r ) {
133  for( size_t c = 0; c < data.cols(); ++c ) {
134 
135  // Subtract mean (i.e., center data).
136  if( scale_means ) {
137  data( r, c ) -= row_mean_stddev[r].mean;
138  }
139 
140  // Scale to unit variance, if needed.
141  if( scale_std ) {
142  assert( row_mean_stddev[r].stddev > 0.0 );
143  data( r, c ) /= row_mean_stddev[r].stddev;
144  }
145  }
146  }
147 
148  return row_mean_stddev;
149 }
150 
151 // ================================================================================================
152 // Mean and Stddev
153 // ================================================================================================
154 
155 MeanStddevPair matrix_mean_stddev( Matrix<double> const& data, double epsilon)
156 {
157  return mean_stddev( data.data(), epsilon );
158 }
159 
160 std::vector<MeanStddevPair> matrix_col_mean_stddev( Matrix<double> const& data, double epsilon )
161 {
162  auto ret = std::vector<MeanStddevPair>( data.cols(), { 0.0, 0.0 } );
163 
164  // Nothing to do. Better stop here or we risk dividing by zero.
165  if( data.rows() == 0 ) {
166  return ret;
167  }
168 
169  // Iterate columns.
170  #pragma omp parallel for
171  for( size_t c = 0; c < data.cols(); ++c ) {
172  double mean = 0.0;
173  double stddev = 0.0;
174 
175  // Calculate column mean.
176  for( size_t r = 0; r < data.rows(); ++r ) {
177  mean += data( r, c );
178  }
179  mean /= static_cast<double>( data.rows() );
180 
181  // Calculate column std dev.
182  for( size_t r = 0; r < data.rows(); ++r ) {
183  stddev += (( data( r, c ) - mean ) * ( data( r, c ) - mean ));
184  }
185  stddev /= static_cast<double>( data.rows() );
186  stddev = std::sqrt(stddev);
187 
188  // The following in an inelegant (but usual) way to handle near-zero values,
189  // which later would cause a division by zero.
190  assert( stddev >= 0.0 );
191  if( stddev <= epsilon ){
192  stddev = 1.0;
193  }
194 
195  // Set result entries.
196  ret[ c ].mean = mean;
197  ret[ c ].stddev = stddev;
198  }
199 
200  return ret;
201 }
202 
203 std::vector<MeanStddevPair> matrix_row_mean_stddev( Matrix<double> const& data, double epsilon )
204 {
205  auto ret = std::vector<MeanStddevPair>( data.rows(), { 0.0, 0.0 } );
206 
207  // Nothing to do. Better stop here or we risk dividing by zero.
208  if( data.cols() == 0 ) {
209  return ret;
210  }
211 
212  // Iterate columns.
213  #pragma omp parallel for
214  for( size_t r = 0; r < data.rows(); ++r ) {
215  double mean = 0.0;
216  double stddev = 0.0;
217 
218  // Calculate row mean.
219  for( size_t c = 0; c < data.cols(); ++c ) {
220  mean += data( r, c );
221  }
222  mean /= static_cast<double>( data.cols() );
223 
224  // Calculate column std dev.
225  for( size_t c = 0; c < data.cols(); ++c ) {
226  stddev += (( data( r, c ) - mean ) * ( data( r, c ) - mean ));
227  }
228  stddev /= static_cast<double>( data.cols() );
229  stddev = std::sqrt(stddev);
230 
231  // The following in an inelegant (but usual) way to handle near-zero values,
232  // which later would cause a division by zero.
233  assert( stddev >= 0.0 );
234  if( stddev <= epsilon ){
235  stddev = 1.0;
236  }
237 
238  // Set result entries.
239  ret[ r ].mean = mean;
240  ret[ r ].stddev = stddev;
241  }
242 
243  return ret;
244 }
245 
246 // =================================================================================================
247 // Quartiles
248 // =================================================================================================
249 
251  Matrix<double> const& data
252 ) {
253  // We make an expensive copy... Not nice, but works for now.
254  auto cpy = data.data();
255  std::sort( cpy.begin(), cpy.end() );
256  return quartiles( cpy );
257 }
258 
260  Matrix<double> const& data,
261  size_t row
262 ) {
263  auto tmp = data.row( row ).to_vector();
264  std::sort( tmp.begin(), tmp.end() );
265 
266  return quartiles( tmp );
267 }
268 
269 std::vector<Quartiles> matrix_row_quartiles(
270  Matrix<double> const& data
271 ) {
272  auto result = std::vector<Quartiles>( data.rows(), Quartiles() );
273 
274  #pragma omp parallel for
275  for( size_t r = 0; r < data.rows(); ++r ) {
276  result[r] = matrix_row_quartiles( data, r );
277  }
278 
279  return result;
280 }
281 
283  Matrix<double> const& data,
284  size_t col
285 ) {
286  auto tmp = data.col( col ).to_vector();
287  std::sort( tmp.begin(), tmp.end() );
288 
289  return quartiles( tmp );
290 }
291 
292 std::vector<Quartiles> matrix_col_quartiles(
293  Matrix<double> const& data
294 ) {
295  auto result = std::vector<Quartiles>( data.cols(), Quartiles() );
296 
297  #pragma omp parallel for
298  for( size_t c = 0; c < data.cols(); ++c ) {
299  result[c] = matrix_col_quartiles( data, c );
300  }
301 
302  return result;
303 }
304 
305 // ================================================================================================
306 // Correlation Matrix
307 // ================================================================================================
308 
310 {
311  // Standardize mean and variance.
312  auto stddata = data;
313  standardize_cols( stddata, true, true );
314 
315  // Calculate matrix. First build the sum of squares, then normalize.
316  auto sscp = sums_of_squares_and_cross_products_matrix( stddata );
317  for( auto& elem : sscp ) {
318  elem /= static_cast<double>( data.rows() );
319  }
320  return sscp;
321 }
322 
323 // ================================================================================================
324 // Covariance Matrix
325 // ================================================================================================
326 
328 {
329  // Standardize mean, but not the variance.
330  auto stddata = data;
331  standardize_cols( stddata, true, false );
332 
333  // Calculate matrix. First build the sum of squares, then normalize.
334  auto sscp = sums_of_squares_and_cross_products_matrix( stddata );
335  for( auto& elem : sscp ) {
336  elem /= static_cast<double>( data.rows() );
337  }
338  return sscp;
339 }
340 
341 // ================================================================================================
342 // Sums of Squares Cross Products Matrix
343 // ================================================================================================
344 
346 {
347  auto mat = Matrix<double>( data.cols(), data.cols() );
348 
349  // Calculate the covariance matrix.
350  for( size_t c1 = 0; c1 < data.cols(); ++c1 ) {
351  for( size_t c2 = c1; c2 < data.cols(); ++c2 ) {
352  mat( c1, c2 ) = 0.0;
353  for( size_t r = 0; r < data.rows(); ++r ) {
354  mat( c1, c2 ) += data( r, c1 ) * data( r, c2 );
355  }
356  mat( c2, c1 ) = mat( c1, c2 );
357  }
358  }
359 
360  return mat;
361 }
362 
363 // =================================================================================================
364 // Correlation Coefficients
365 // =================================================================================================
366 
368  Matrix<double> const& mat1, size_t col1,
369  Matrix<double> const& mat2, size_t col2
370 ) {
371  // Checks.
372  if( mat1.rows() != mat2.rows() ) {
373  throw std::runtime_error( "Matrices need to have same number of rows." );
374  }
375  if( col1 >= mat1.cols() || col2 >= mat2.cols() ) {
376  throw std::runtime_error( "Column indices cannot be bigger then number of columns." );
377  }
378 
379  auto c1 = mat1.col( col1 );
380  auto c2 = mat2.col( col2 );
381  return pearson_correlation_coefficient( c1.begin(), c1.end(), c2.begin(), c2.end() );
382 }
383 
385  Matrix<double> const& mat1, size_t row1,
386  Matrix<double> const& mat2, size_t row2
387 ) {
388  // Checks.
389  if( mat1.cols() != mat2.cols() ) {
390  throw std::runtime_error( "Matrices need to have same number of columns." );
391  }
392  if( row1 >= mat1.rows() || row2 >= mat2.rows() ) {
393  throw std::runtime_error( "Row indices cannot be bigger then number of rows." );
394  }
395 
396  auto r1 = mat1.row( row1 );
397  auto r2 = mat2.row( row2 );
398  return pearson_correlation_coefficient( r1.begin(), r1.end(), r2.begin(), r2.end() );
399 }
400 
402  Matrix<double> const& mat1, size_t col1,
403  Matrix<double> const& mat2, size_t col2
404 ) {
405  if( mat1.rows() != mat2.rows() ) {
406  throw std::runtime_error( "Matrices need to have same number of rows." );
407  }
408  if( col1 >= mat1.cols() || col2 >= mat2.cols() ) {
409  throw std::runtime_error( "Column indices cannot be bigger then number of columns." );
410  }
411 
412  auto c1 = mat1.col( col1 );
413  auto c2 = mat2.col( col2 );
414  return spearmans_rank_correlation_coefficient( c1.begin(), c1.end(), c2.begin(), c2.end() );
415 }
416 
418  Matrix<double> const& mat1, size_t row1,
419  Matrix<double> const& mat2, size_t row2
420 ) {
421  // Checks.
422  if( mat1.cols() != mat2.cols() ) {
423  throw std::runtime_error( "Matrices need to have same number of columns." );
424  }
425  if( row1 >= mat1.rows() || row2 >= mat2.rows() ) {
426  throw std::runtime_error( "Row indices cannot be bigger then number of rows." );
427  }
428 
429  auto r1 = mat1.row( row1 );
430  auto r2 = mat2.row( row2 );
431  return spearmans_rank_correlation_coefficient( r1.begin(), r1.end(), r2.begin(), r2.end() );
432 }
433 
434 } // namespace utils
435 } // namespace genesis
double matrix_row_pearson_correlation_coefficient(Matrix< double > const &mat1, size_t row1, Matrix< double > const &mat2, size_t row2)
Calculate the Pearson Correlation Coefficient between two row of two Matrices.
Definition: matrix.cpp:384
std::vector< MinMaxPair< T > > matrix_col_minmax(Matrix< T > const &data)
Calculate the column-wise min and max values of a Matrix.
Definition: math/matrix.hpp:70
double pearson_correlation_coefficient(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b)
Calculate the Pearson Correlation Coefficient between two ranges of double.
Definition: statistics.hpp:426
std::vector< MeanStddevPair > standardize_rows(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the rows of a Matrix by subtracting the mean and scaling to unit variance.
Definition: matrix.cpp:122
std::vector< MeanStddevPair > matrix_row_mean_stddev(Matrix< double > const &data, double epsilon)
Calcualte the row-wise mean and standard deviation of a Matrix.
Definition: matrix.cpp:203
Quartiles quartiles(RandomAccessIterator first, RandomAccessIterator last)
Definition: statistics.hpp:248
Matrix< double > correlation_matrix(Matrix< double > const &data)
Calculate the correlation Matrix of a given data Matrix.
Definition: matrix.cpp:309
Quartiles matrix_quartiles(Matrix< double > const &data)
Calculate the Quartiles of the elmements in Matrix of double.
Definition: matrix.cpp:250
std::vector< MinMaxPair< T > > matrix_row_minmax(Matrix< T > const &data)
Calculate the row-wise min and max values of a Matrix.
Quartiles matrix_col_quartiles(Matrix< double > const &data, size_t col)
Definition: matrix.cpp:282
container_type const & data() const
double matrix_col_pearson_correlation_coefficient(Matrix< double > const &mat1, size_t col1, Matrix< double > const &mat2, size_t col2)
Calculate the Pearson Correlation Coefficient between two columns of two Matrices.
Definition: matrix.cpp:367
double mean(const Histogram &h)
Compute the bin-weighted arithmetic mean.
MatrixCol< self_type, value_type > col(size_t col)
MatrixRow< self_type, value_type > row(size_t row)
std::vector< MeanStddevPair > matrix_col_mean_stddev(Matrix< double > const &data, double epsilon)
Calcualte the column-wise mean and standard deviation of a Matrix.
Definition: matrix.cpp:160
Store a mean and a standard deviation value.
Definition: statistics.hpp:77
std::vector< MinMaxPair< double > > normalize_cols(Matrix< double > &data)
Normalize the columns of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: matrix.cpp:51
MeanStddevPair matrix_mean_stddev(Matrix< double > const &data, double epsilon)
Calcualte the mean and standard deviation of all elements in a Matrix.
Definition: matrix.cpp:155
std::vector< MinMaxPair< double > > normalize_rows(Matrix< double > &data)
Normalize the rows of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: matrix.cpp:70
Quartiles matrix_row_quartiles(Matrix< double > const &data, size_t row)
Definition: matrix.cpp:259
Matrix< double > covariance_matrix(Matrix< double > const &data)
Calculate the covariance Matrix of a given data Matrix.
Definition: matrix.cpp:327
double matrix_row_spearmans_rank_correlation_coefficient(Matrix< double > const &mat1, size_t row1, Matrix< double > const &mat2, size_t row2)
Calculate Spearman's Rank Correlation Coefficient between two row of two Matrices.
Definition: matrix.cpp:417
Store the values of quartiles: q0 == min, q1 == 25%, q2 == 50%, q3 == 75%, q4 == max.
Definition: statistics.hpp:87
double spearmans_rank_correlation_coefficient(RandomAccessIteratorA first_a, RandomAccessIteratorA last_a, RandomAccessIteratorB first_b, RandomAccessIteratorB last_b)
Calculate Spearman's Rank Correlation Coefficient between two ranges of double.
Definition: statistics.hpp:506
Matrix< double > sums_of_squares_and_cross_products_matrix(Matrix< double > const &data)
Calculate the Sums of Squares and Cross Products Matrix (SSCP Matrix).
Definition: matrix.cpp:345
double matrix_col_spearmans_rank_correlation_coefficient(Matrix< double > const &mat1, size_t col1, Matrix< double > const &mat2, size_t col2)
Calculate Spearman's Rank Correlation Coefficient between two columns of two Matrices.
Definition: matrix.cpp:401
std::vector< MeanStddevPair > standardize_cols(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the columns of a Matrix by subtracting the mean and scaling to unit variance.
Definition: matrix.cpp:93
MeanStddevPair mean_stddev(ForwardIterator first, ForwardIterator last, double epsilon=-1.0)
Calculate the mean and standard deviation of a range of double elements.
Definition: statistics.hpp:114