A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
statistics.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 
35 #include <algorithm>
36 #include <cassert>
37 #include <cmath>
38 #include <stdexcept>
39 
40 #ifdef GENESIS_OPENMP
41 # include <omp.h>
42 #endif
43 
44 namespace genesis {
45 namespace utils {
46 
47 // ================================================================================================
48 // Normalize
49 // ================================================================================================
50 
51 std::vector<MinMaxPair<double>> normalize_cols( Matrix<double>& data )
52 {
53  auto col_minmax = matrix_col_minmax( data );
54  assert( col_minmax.size() == data.cols() );
55 
56  // Iterate the matrix.
57  #pragma omp parallel for
58  for( size_t r = 0; r < data.rows(); ++r ) {
59  for( size_t c = 0; c < data.cols(); ++c ) {
60  // Adjust column values.
61  assert( col_minmax[c].max >= col_minmax[c].min );
62  double diff = col_minmax[c].max - col_minmax[c].min;
63  data( r, c ) = ( data( r, c ) - col_minmax[c].min ) / diff;
64  }
65  }
66 
67  return col_minmax;
68 }
69 
70 std::vector<MinMaxPair<double>> normalize_rows( Matrix<double>& data )
71 {
72  auto row_minmax = matrix_row_minmax( data );
73  assert( row_minmax.size() == data.rows() );
74 
75  // Iterate the matrix.
76  #pragma omp parallel for
77  for( size_t r = 0; r < data.rows(); ++r ) {
78  for( size_t c = 0; c < data.cols(); ++c ) {
79  // Adjust row values.
80  assert( row_minmax[r].max >= row_minmax[r].min );
81  double diff = row_minmax[r].max - row_minmax[r].min;
82  data( r, c ) = ( data( r, c ) - row_minmax[r].min ) / diff;
83  }
84  }
85 
86  return row_minmax;
87 }
88 
89 // ================================================================================================
90 // Standardize
91 // ================================================================================================
92 
93 std::vector<MeanStddevPair> standardize_cols(
94  Matrix<double>& data,
95  bool scale_means,
96  bool scale_std
97 ) {
98  auto col_mean_stddev = matrix_col_mean_stddev( data, 0.0000001 );
99  assert( col_mean_stddev.size() == data.cols() );
100 
101  // Iterate the matrix.
102  #pragma omp parallel for
103  for( size_t r = 0; r < data.rows(); ++r ) {
104  for( size_t c = 0; c < data.cols(); ++c ) {
105 
106  // Subtract mean (i.e., center data).
107  if( scale_means ) {
108  data( r, c ) -= col_mean_stddev[c].mean;
109  }
110 
111  // Scale to unit variance, if needed.
112  if( scale_std ) {
113  assert( col_mean_stddev[c].stddev > 0.0 );
114  data( r, c ) /= col_mean_stddev[c].stddev;
115  }
116  }
117  }
118 
119  return col_mean_stddev;
120 }
121 
122 std::vector<MeanStddevPair> standardize_rows(
123  Matrix<double>& data,
124  bool scale_means,
125  bool scale_std
126 ) {
127  auto row_mean_stddev = matrix_row_mean_stddev( data, 0.0000001 );
128  assert( row_mean_stddev.size() == data.rows() );
129 
130  // Iterate the matrix.
131  #pragma omp parallel for
132  for( size_t r = 0; r < data.rows(); ++r ) {
133  for( size_t c = 0; c < data.cols(); ++c ) {
134 
135  // Subtract mean (i.e., center data).
136  if( scale_means ) {
137  data( r, c ) -= row_mean_stddev[r].mean;
138  }
139 
140  // Scale to unit variance, if needed.
141  if( scale_std ) {
142  assert( row_mean_stddev[r].stddev > 0.0 );
143  data( r, c ) /= row_mean_stddev[r].stddev;
144  }
145  }
146  }
147 
148  return row_mean_stddev;
149 }
150 
151 // ================================================================================================
152 // Mean and Stddev
153 // ================================================================================================
154 
155 MeanStddevPair matrix_mean_stddev( Matrix<double> const& data, double epsilon)
156 {
157  return mean_stddev( data.data(), epsilon );
158 }
159 
160 std::vector<MeanStddevPair> matrix_col_mean_stddev( Matrix<double> const& data, double epsilon )
161 {
162  auto ret = std::vector<MeanStddevPair>( data.cols(), { 0.0, 0.0 } );
163 
164  // Nothing to do. Better stop here or we risk dividing by zero.
165  if( data.rows() == 0 ) {
166  return ret;
167  }
168 
169  // Iterate columns.
170  #pragma omp parallel for
171  for( size_t c = 0; c < data.cols(); ++c ) {
172  double mean = 0.0;
173  double stddev = 0.0;
174 
175  // Calculate column mean.
176  for( size_t r = 0; r < data.rows(); ++r ) {
177  mean += data( r, c );
178  }
179  mean /= static_cast<double>( data.rows() );
180 
181  // Calculate column std dev.
182  for( size_t r = 0; r < data.rows(); ++r ) {
183  stddev += (( data( r, c ) - mean ) * ( data( r, c ) - mean ));
184  }
185  stddev /= static_cast<double>( data.rows() );
186  stddev = std::sqrt(stddev);
187 
188  // The following in an inelegant (but usual) way to handle near-zero values,
189  // which later would cause a division by zero.
190  assert( stddev >= 0.0 );
191  if( stddev <= epsilon ){
192  stddev = 1.0;
193  }
194 
195  // Set result entries.
196  ret[ c ].mean = mean;
197  ret[ c ].stddev = stddev;
198  }
199 
200  return ret;
201 }
202 
203 std::vector<MeanStddevPair> matrix_row_mean_stddev( Matrix<double> const& data, double epsilon )
204 {
205  auto ret = std::vector<MeanStddevPair>( data.rows(), { 0.0, 0.0 } );
206 
207  // Nothing to do. Better stop here or we risk dividing by zero.
208  if( data.cols() == 0 ) {
209  return ret;
210  }
211 
212  // Iterate columns.
213  #pragma omp parallel for
214  for( size_t r = 0; r < data.rows(); ++r ) {
215  double mean = 0.0;
216  double stddev = 0.0;
217 
218  // Calculate row mean.
219  for( size_t c = 0; c < data.cols(); ++c ) {
220  mean += data( r, c );
221  }
222  mean /= static_cast<double>( data.cols() );
223 
224  // Calculate column std dev.
225  for( size_t c = 0; c < data.cols(); ++c ) {
226  stddev += (( data( r, c ) - mean ) * ( data( r, c ) - mean ));
227  }
228  stddev /= static_cast<double>( data.cols() );
229  stddev = std::sqrt(stddev);
230 
231  // The following in an inelegant (but usual) way to handle near-zero values,
232  // which later would cause a division by zero.
233  assert( stddev >= 0.0 );
234  if( stddev <= epsilon ){
235  stddev = 1.0;
236  }
237 
238  // Set result entries.
239  ret[ r ].mean = mean;
240  ret[ r ].stddev = stddev;
241  }
242 
243  return ret;
244 }
245 
246 // =================================================================================================
247 // Quartiles
248 // =================================================================================================
249 
251  Matrix<double> const& data
252 ) {
253  // We make an expensive copy... Not nice, but works for now.
254  auto cpy = data.data();
255  std::sort( cpy.begin(), cpy.end() );
256  return quartiles( cpy );
257 }
258 
260  Matrix<double> const& data,
261  size_t row
262 ) {
263  // Fill a vector with the values of the row, and sort it.
264  auto tmp = data.row( row );
265  std::sort( tmp.begin(), tmp.end() );
266 
267  return quartiles( tmp );
268 }
269 
270 std::vector<Quartiles> matrix_row_quartiles(
271  Matrix<double> const& data
272 ) {
273  auto result = std::vector<Quartiles>( data.rows(), Quartiles() );
274 
275  #pragma omp parallel for
276  for( size_t r = 0; r < data.rows(); ++r ) {
277  result[r] = matrix_row_quartiles( data, r );
278  }
279 
280  return result;
281 }
282 
284  Matrix<double> const& data,
285  size_t col
286 ) {
287  // Fill a vector with the values of the column, and sort it.
288  auto tmp = data.col( col );
289  std::sort( tmp.begin(), tmp.end() );
290 
291  return quartiles( tmp );
292 }
293 
294 std::vector<Quartiles> matrix_col_quartiles(
295  Matrix<double> const& data
296 ) {
297  auto result = std::vector<Quartiles>( data.cols(), Quartiles() );
298 
299  #pragma omp parallel for
300  for( size_t c = 0; c < data.cols(); ++c ) {
301  result[c] = matrix_col_quartiles( data, c );
302  }
303 
304  return result;
305 }
306 
307 // ================================================================================================
308 // Correlation Matrix
309 // ================================================================================================
310 
312 {
313  // Standardize mean and variance.
314  auto stddata = data;
315  standardize_cols( stddata, true, true );
316 
317  // Calculate matrix. First build the sum of squares, then normalize.
318  auto sscp = sums_of_squares_and_cross_products_matrix( stddata );
319  for( auto& elem : sscp ) {
320  elem /= static_cast<double>( data.rows() );
321  }
322  return sscp;
323 }
324 
325 // ================================================================================================
326 // Covariance Matrix
327 // ================================================================================================
328 
330 {
331  // Standardize mean, but not the variance.
332  auto stddata = data;
333  standardize_cols( stddata, true, false );
334 
335  // Calculate matrix. First build the sum of squares, then normalize.
336  auto sscp = sums_of_squares_and_cross_products_matrix( stddata );
337  for( auto& elem : sscp ) {
338  elem /= static_cast<double>( data.rows() );
339  }
340  return sscp;
341 }
342 
343 // ================================================================================================
344 // Sums of Squares Cross Products Matrix
345 // ================================================================================================
346 
348 {
349  auto mat = Matrix<double>( data.cols(), data.cols() );
350 
351  // Calculate the covariance matrix.
352  for( size_t c1 = 0; c1 < data.cols(); ++c1 ) {
353  for( size_t c2 = c1; c2 < data.cols(); ++c2 ) {
354  mat( c1, c2 ) = 0.0;
355  for( size_t r = 0; r < data.rows(); ++r ) {
356  mat( c1, c2 ) += data( r, c1 ) * data( r, c2 );
357  }
358  mat( c2, c1 ) = mat( c1, c2 );
359  }
360  }
361 
362  return mat;
363 }
364 
365 // =================================================================================================
366 // Pearson Correlation Coefficient
367 // =================================================================================================
368 
370  Matrix<double> const& mat1, size_t col1,
371  Matrix<double> const& mat2, size_t col2
372 ) {
373  // Checks.
374  if( mat1.rows() != mat2.rows() ) {
375  throw std::runtime_error( "Matrices need to have same number of rows." );
376  }
377  if( col1 >= mat1.cols() || col2 >= mat2.cols() ) {
378  throw std::runtime_error( "Column indices cannot be bigger then number of columns." );
379  }
380 
381  // Make copies. Not the best strategy, but works for now.
382  auto c1 = mat1.col( col1 );
383  auto c2 = mat2.col( col2 );
384 
385  return pearson_correlation_coefficient( c1, c2 );
386 }
387 
389  Matrix<double> const& mat1, size_t row1,
390  Matrix<double> const& mat2, size_t row2
391 ) {
392  // Checks.
393  if( mat1.cols() != mat2.cols() ) {
394  throw std::runtime_error( "Matrices need to have same number of columns." );
395  }
396  if( row1 >= mat1.rows() || row2 >= mat2.rows() ) {
397  throw std::runtime_error( "Row indices cannot be bigger then number of rows." );
398  }
399 
400  // Make copies. Not the best strategy, but works for now.
401  auto r1 = mat1.row( row1 );
402  auto r2 = mat2.row( row2 );
403 
404  return pearson_correlation_coefficient( r1, r2 );
405 }
406 
408  Matrix<double> const& mat1, size_t col1,
409  Matrix<double> const& mat2, size_t col2
410 ) {
411  if( mat1.rows() != mat2.rows() ) {
412  throw std::runtime_error( "Matrices need to have same number of rows." );
413  }
414  if( col1 >= mat1.cols() || col2 >= mat2.cols() ) {
415  throw std::runtime_error( "Column indices cannot be bigger then number of columns." );
416  }
417 
418  // Make copies. Not the best strategy, but works for now.
419  auto c1 = mat1.col( col1 );
420  auto c2 = mat2.col( col2 );
421 
423 }
424 
426  Matrix<double> const& mat1, size_t row1,
427  Matrix<double> const& mat2, size_t row2
428 ) {
429  // Checks.
430  if( mat1.cols() != mat2.cols() ) {
431  throw std::runtime_error( "Matrices need to have same number of columns." );
432  }
433  if( row1 >= mat1.rows() || row2 >= mat2.rows() ) {
434  throw std::runtime_error( "Row indices cannot be bigger then number of rows." );
435  }
436 
437  // Make copies. Not the best strategy, but works for now.
438  auto r1 = mat1.row( row1 );
439  auto r2 = mat2.row( row2 );
440 
442 }
443 
444 } // namespace utils
445 } // namespace genesis
size_t cols() const
Definition: matrix.hpp:156
double spearmans_rank_correlation_coefficient(std::vector< double > const &vec_a, std::vector< double > const &vec_b)
Calculate Spearman's Rank Correlation Coefficient between the entries of two vectors.
Definition: common.cpp:199
double matrix_row_pearson_correlation_coefficient(Matrix< double > const &mat1, size_t row1, Matrix< double > const &mat2, size_t row2)
Calculate the Pearson Correlation Coefficient between two row of two Matrices.
Definition: statistics.cpp:388
std::vector< MinMaxPair< T > > matrix_col_minmax(Matrix< T > const &data)
Calculate the column-wise min and max values of a Matrix.
Definition: statistics.hpp:71
std::vector< T > const & data() const
Definition: matrix.hpp:166
std::vector< MeanStddevPair > standardize_rows(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the rows of a Matrix by subtracting the mean and scaling to unit variance.
Definition: statistics.cpp:122
std::vector< MeanStddevPair > matrix_row_mean_stddev(Matrix< double > const &data, double epsilon)
Calcualte the row-wise mean and standard deviation of a Matrix.
Definition: statistics.cpp:203
Matrix< double > correlation_matrix(Matrix< double > const &data)
Calculate the correlation Matrix of a given data Matrix.
Definition: statistics.cpp:311
Quartiles matrix_quartiles(Matrix< double > const &data)
Calculate the Quartiles of the elmements in Matrix of double.
Definition: statistics.cpp:250
std::vector< MinMaxPair< T > > matrix_row_minmax(Matrix< T > const &data)
Calculate the row-wise min and max values of a Matrix.
Definition: statistics.hpp:106
Quartiles matrix_col_quartiles(Matrix< double > const &data, size_t col)
Definition: statistics.cpp:283
double matrix_col_pearson_correlation_coefficient(Matrix< double > const &mat1, size_t col1, Matrix< double > const &mat2, size_t col2)
Calculate the Pearson Correlation Coefficient between two columns of two Matrices.
Definition: statistics.cpp:369
double mean(const Histogram &h)
Compute the bin-weighted arithmetic mean.
Matrix statstic functions.
double pearson_correlation_coefficient(std::vector< double > const &vec_a, std::vector< double > const &vec_b)
Calculate the Pearson Correlation Coefficient between the entries of two vectors. ...
Definition: common.cpp:162
std::vector< T > row(size_t index) const
Definition: matrix.hpp:205
MeanStddevPair mean_stddev(std::vector< double > const &vec, double epsilon)
Calcualte the mean and standard deviation of a vector of double.
Definition: common.cpp:47
std::vector< MeanStddevPair > matrix_col_mean_stddev(Matrix< double > const &data, double epsilon)
Calcualte the column-wise mean and standard deviation of a Matrix.
Definition: statistics.cpp:160
Store a mean and a standard deviation value.
Definition: common.hpp:73
std::vector< MinMaxPair< double > > normalize_cols(Matrix< double > &data)
Normalize the columns of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: statistics.cpp:51
MeanStddevPair matrix_mean_stddev(Matrix< double > const &data, double epsilon)
Calcualte the mean and standard deviation of all elements in a Matrix.
Definition: statistics.cpp:155
std::vector< MinMaxPair< double > > normalize_rows(Matrix< double > &data)
Normalize the rows of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: statistics.cpp:70
size_t rows() const
Definition: matrix.hpp:151
Quartiles quartiles(std::vector< double > const &vec)
Calculate the Quartiles of a vector of double.
Definition: common.cpp:126
Quartiles matrix_row_quartiles(Matrix< double > const &data, size_t row)
Definition: statistics.cpp:259
Matrix< double > covariance_matrix(Matrix< double > const &data)
Calculate the covariance Matrix of a given data Matrix.
Definition: statistics.cpp:329
std::vector< T > col(size_t index) const
Definition: matrix.hpp:218
double matrix_row_spearmans_rank_correlation_coefficient(Matrix< double > const &mat1, size_t row1, Matrix< double > const &mat2, size_t row2)
Calculate Spearman's Rank Correlation Coefficient between two row of two Matrices.
Definition: statistics.cpp:425
Store the values of quartiles: q0 == min, q1 == 25%, q2 == 50%, q3 == 75%, q4 == max.
Definition: common.hpp:83
Matrix< double > sums_of_squares_and_cross_products_matrix(Matrix< double > const &data)
Calculate the Sums of Squares and Cross Products Matrix (SSCP Matrix).
Definition: statistics.cpp:347
double matrix_col_spearmans_rank_correlation_coefficient(Matrix< double > const &mat1, size_t col1, Matrix< double > const &mat2, size_t col2)
Calculate Spearman's Rank Correlation Coefficient between two columns of two Matrices.
Definition: statistics.cpp:407
std::vector< MeanStddevPair > standardize_cols(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the columns of a Matrix by subtracting the mean and scaling to unit variance.
Definition: statistics.cpp:93