A toolkit for working with phylogenetic data.
v0.24.0
math/matrix.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_MATH_MATRIX_H_
2 #define GENESIS_UTILS_MATH_MATRIX_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
36 
37 #include <cmath>
38 #include <limits>
39 #include <vector>
40 
41 namespace genesis {
42 namespace utils {
43 
44 // =================================================================================================
45 // Min Max
46 // =================================================================================================
47 
54 template< typename T >
55 MinMaxPair<T> matrix_minmax( Matrix<T> const& data, bool ignore_non_finite_values = true )
56 {
57  // Edge case
58  if( data.rows() == 0 || data.cols() == 0 ) {
59  return {
60  std::numeric_limits<double>::quiet_NaN(),
61  std::numeric_limits<double>::quiet_NaN()
62  };
63  }
64 
65  // Init with first element
66  auto ret = MinMaxPair<T>{
67  std::numeric_limits<double>::infinity(),
68  - std::numeric_limits<double>::infinity()
69  };
70 
71  // Iterate
72  size_t cnt = 0;
73  for( auto const& e : data ) {
74  if( std::isfinite( e ) || ! ignore_non_finite_values ) {
75  ret.min = std::min( ret.min, e );
76  ret.max = std::max( ret.max, e );
77  ++cnt;
78  }
79  }
80 
81  // If we found no valid values at all, return nan.
82  if( cnt == 0 ) {
83  return {
84  std::numeric_limits<double>::quiet_NaN(),
85  std::numeric_limits<double>::quiet_NaN()
86  };
87  }
88 
89  return ret;
90 }
91 
98 template< typename T >
99 std::vector<MinMaxPair<T>> matrix_col_minmax( Matrix<T> const& data, bool ignore_non_finite_values = true )
100 {
101  auto ret = std::vector<MinMaxPair<T>>( data.cols(), {
102  std::numeric_limits<double>::quiet_NaN(),
103  std::numeric_limits<double>::quiet_NaN()
104  });
105 
106  // Nothing to do.
107  if( data.rows() == 0 ) {
108  return ret;
109  }
110 
111  // Iterate
112  for( size_t c = 0; c < data.cols(); ++c ) {
113  ret[ c ].min = std::numeric_limits<double>::infinity();
114  ret[ c ].max = - std::numeric_limits<double>::infinity();
115 
116  size_t cnt = 0;
117  for( size_t r = 0; r < data.rows(); ++r ) {
118 
119  // Find min and max of the column. If the value is not finite, skip it.
120  auto const& e = data( r, c );
121  if( std::isfinite( e ) || ! ignore_non_finite_values ) {
122  ret[ c ].min = std::min( ret[ c ].min, data( r, c ) );
123  ret[ c ].max = std::max( ret[ c ].max, data( r, c ) );
124  ++cnt;
125  }
126  }
127 
128  // If we found no valid values at all, return nan.
129  if( cnt == 0 ) {
130  ret[ c ].min = std::numeric_limits<double>::quiet_NaN();
131  ret[ c ].max = std::numeric_limits<double>::quiet_NaN();
132  }
133  }
134 
135  return ret;
136 }
137 
144 template< typename T >
145 std::vector<MinMaxPair<T>> matrix_row_minmax( Matrix<T> const& data, bool ignore_non_finite_values = true )
146 {
147  auto ret = std::vector<MinMaxPair<T>>( data.rows(), {
148  std::numeric_limits<double>::quiet_NaN(),
149  std::numeric_limits<double>::quiet_NaN()
150  });
151 
152  // Nothing to do.
153  if( data.cols() == 0 ) {
154  return ret;
155  }
156 
157  for( size_t r = 0; r < data.rows(); ++r ) {
158  // Init with the first col.
159  ret[ r ].min = std::numeric_limits<double>::infinity();
160  ret[ r ].max = - std::numeric_limits<double>::infinity();
161 
162  // Go through all other cols. If the value is not finite, skip it.
163  size_t cnt = 0;
164  for( size_t c = 0; c < data.cols(); ++c ) {
165  auto const& e = data( r, c );
166  if( std::isfinite( e ) || ! ignore_non_finite_values ) {
167  ret[ r ].min = std::min( ret[ r ].min, data( r, c ) );
168  ret[ r ].max = std::max( ret[ r ].max, data( r, c ) );
169  ++cnt;
170  }
171  }
172 
173  // If we found no valid values at all, return nan.
174  if( cnt == 0 ) {
175  ret[ r ].min = std::numeric_limits<double>::quiet_NaN();
176  ret[ r ].max = std::numeric_limits<double>::quiet_NaN();
177  }
178  }
179 
180  return ret;
181 }
182 
189 template< typename T >
190 T matrix_sum( Matrix<T> const& data, bool ignore_non_finite_values = true )
191 {
192  // Get row sums.
193  auto sum = T{};
194  for( auto const& e : data ) {
195  if( std::isfinite( e ) || ! ignore_non_finite_values ) {
196  sum += e;
197  }
198  }
199  return sum;
200 }
201 
208 template< typename T >
209 std::vector<T> matrix_row_sums( Matrix<T> const& data, bool ignore_non_finite_values = true )
210 {
211  // Get row sums.
212  auto row_sums = std::vector<T>( data.rows(), T{} );
213  for( size_t i = 0; i < data.rows(); ++i ) {
214  for( size_t j = 0; j < data.cols(); ++j ) {
215  auto const& e = data( i, j );
216  if( std::isfinite( e ) || ! ignore_non_finite_values ) {
217  row_sums[ i ] += e;
218  }
219  }
220  }
221  return row_sums;
222 }
223 
230 template< typename T >
231 std::vector<T> matrix_col_sums( Matrix<T> const& data, bool ignore_non_finite_values = true )
232 {
233  // Get col sums.
234  auto col_sums = std::vector<T>( data.cols(), T{} );
235  for( size_t i = 0; i < data.rows(); ++i ) {
236  for( size_t j = 0; j < data.cols(); ++j ) {
237  auto const& e = data( i, j );
238  if( std::isfinite( e ) || ! ignore_non_finite_values ) {
239  col_sums[ j ] += e;
240  }
241  }
242  }
243  return col_sums;
244 }
245 
246 // =================================================================================================
247 // Normalization and Standardization
248 // =================================================================================================
249 
262 std::vector<MinMaxPair<double>> normalize_cols( Matrix<double>& data );
263 
276 std::vector<MinMaxPair<double>> normalize_rows( Matrix<double>& data );
277 
295 std::vector<MeanStddevPair> standardize_cols(
296  Matrix<double>& data,
297  bool scale_means = true,
298  bool scale_std = true
299 );
300 
318 std::vector<MeanStddevPair> standardize_rows(
319  Matrix<double>& data,
320  bool scale_means = true,
321  bool scale_std = true
322 );
323 
338 std::vector<size_t> filter_constant_columns(
339  Matrix<double>& data,
340  double epsilon = 1e-5
341 );
342 
343 // =================================================================================================
344 // Correlation and Covariance
345 // =================================================================================================
346 
354 
362 
367 
368 // =================================================================================================
369 // Sorting
370 // =================================================================================================
371 
383 template< typename T >
385 {
386  if( data.rows() != data.cols() ) {
387  throw std::runtime_error( "Symmetric sort only works on square matrices." );
388  }
389 
390  auto result = Matrix<T>( data.rows(), data.cols() );
391  auto row_sums = matrix_row_sums( data );
392  auto si = sort_indices( row_sums.begin(), row_sums.end() );
393 
394  for( size_t i = 0; i < data.rows(); ++i ) {
395  for( size_t j = 0; j < data.cols(); ++j ) {
396  result( i, j ) = data( si[i], si[j] );
397  }
398  }
399 
400  return result;
401 }
402 
414 template< typename T >
416 {
417  if( data.rows() != data.cols() ) {
418  throw std::runtime_error( "Symmetric sort only works on square matrices." );
419  }
420 
421  auto result = Matrix<T>( data.rows(), data.cols() );
422  auto col_sums = matrix_col_sums( data );
423  auto si = sort_indices( col_sums.begin(), col_sums.end() );
424 
425  for( size_t i = 0; i < data.rows(); ++i ) {
426  for( size_t j = 0; j < data.cols(); ++j ) {
427  result( i, j ) = data( si[i], si[j] );
428  }
429  }
430 
431  return result;
432 }
433 
442 template< typename T >
444 {
445  if( data.rows() != data.cols() ) {
446  throw std::runtime_error( "Symmetric sort only works on square matrices." );
447  }
448 
449  // Find the row and col that contains the max element in the rest of the matrix,
450  // that is, excluding the first rows and cols according to start.
451  auto find_max = []( Matrix<T> const& mat, size_t start ){
452  size_t max_r = start;
453  size_t max_c = start;
454  for( size_t r = start; r < mat.rows(); ++r ) {
455  for( size_t c = start; c < mat.cols(); ++c ) {
456  if( mat(r, c) > mat( max_r, max_c ) ) {
457  max_r = r;
458  max_c = c;
459  }
460  }
461  }
462  return std::make_pair( max_r, max_c );
463  };
464 
465  // Sort by swapping rows and cols.
466  auto mat = data;
467  assert( mat.rows() == mat.cols() );
468  for( size_t i = 0; i < mat.rows(); ++i ) {
469  auto const max_pair = find_max( mat, i );
470  matrix_swap_rows( mat, i, max_pair.first );
471  matrix_swap_cols( mat, i, max_pair.second );
472  }
473  return mat;
474 }
475 
476 // =================================================================================================
477 // Matrix Addition
478 // =================================================================================================
479 
485 template< typename T = double, typename A = double, typename B = double >
487 {
488  if( a.rows() != b.rows() || a.cols() != b.cols() ) {
489  throw std::runtime_error( "Cannot add matrices with different dimensions." );
490  }
491 
492  auto result = Matrix<T>( a.rows(), a.cols() );
493  for( size_t r = 0; r < a.rows(); ++r ) {
494  for( size_t c = 0; c < a.cols(); ++c ) {
495  result( r, c ) = a( r, c ) + b( r, c );
496  }
497  }
498  return result;
499 }
500 
504 template< typename T = double, typename A = double, typename B = double >
505 Matrix<T> matrix_addition( Matrix<A> const& matrix, B const& scalar )
506 {
507  auto result = Matrix<T>( matrix.rows(), matrix.cols() );
508  for( size_t r = 0; r < matrix.rows(); ++r ) {
509  for( size_t c = 0; c < matrix.cols(); ++c ) {
510  result( r, c ) = matrix( r, c ) + scalar;
511  }
512  }
513  return result;
514 }
515 
521 template< typename T = double, typename A = double, typename B = double >
523 {
524  if( a.rows() != b.rows() || a.cols() != b.cols() ) {
525  throw std::runtime_error( "Cannot add matrices with different dimensions." );
526  }
527 
528  auto result = Matrix<T>( a.rows(), a.cols() );
529  for( size_t r = 0; r < a.rows(); ++r ) {
530  for( size_t c = 0; c < a.cols(); ++c ) {
531  result( r, c ) = a( r, c ) - b( r, c );
532  }
533  }
534  return result;
535 }
536 
537 // =================================================================================================
538 // Matrix Multiplication
539 // =================================================================================================
540 
547 template< typename T = double, typename A = double, typename B = double >
549 {
550  if( a.cols() != b.rows() ) {
551  throw std::runtime_error( "Cannot multiply matrices if a.cols() != b.rows()." );
552  }
553 
554  // Simple and naive. Fast enough for the few occasions were we need this.
555  // If Genesis at some point starts to need more elaborate matrix operations, it might be
556  // worth including some proper library for this.
557  auto result = Matrix<T>( a.rows(), b.cols() );
558  for( size_t r = 0; r < a.rows(); ++r ) {
559  for( size_t c = 0; c < b.cols(); ++c ) {
560  result( r, c ) = 0.0;
561  for( size_t j = 0; j < a.cols(); ++j ) {
562  result( r, c ) += a( r, j ) * b( j, c );
563  }
564  }
565  }
566 
567  return result;
568 }
569 
577 template< typename T = double, typename A = double, typename B = double >
578 std::vector<T> matrix_multiplication( std::vector<A> const& a, Matrix<B> const& b )
579 {
580  if( a.size() != b.rows() ) {
581  throw std::runtime_error( "Cannot multiply vector with matrix if a.size() != b.rows()." );
582  }
583 
584  auto result = std::vector<T>( b.cols(), 0.0 );
585  for( size_t c = 0; c < b.cols(); ++c ) {
586  for( size_t j = 0; j < a.size(); ++j ) {
587  result[ c ] += a[ j ] * b( j, c );
588  }
589  }
590 
591  return result;
592 }
593 
601 template< typename T = double, typename A = double, typename B = double >
602 std::vector<T> matrix_multiplication( Matrix<A> const& a, std::vector<B> const& b )
603 {
604  if( a.cols() != b.size() ) {
605  throw std::runtime_error( "Cannot multiply matrix with vector if a.cols() != b.size()." );
606  }
607 
608  auto result = std::vector<T>( a.rows(), 0.0 );
609  for( size_t r = 0; r < a.rows(); ++r ) {
610  for( size_t j = 0; j < a.cols(); ++j ) {
611  result[ r ] += a( r, j ) * b[ j ];
612  }
613  }
614 
615  return result;
616 }
617 
621 template< typename T = double, typename A = double, typename B = double >
622 Matrix<T> matrix_multiplication( Matrix<A> const& matrix, B const& scalar )
623 {
624  auto result = Matrix<T>( matrix.rows(), matrix.cols() );
625  for( size_t r = 0; r < matrix.rows(); ++r ) {
626  for( size_t c = 0; c < matrix.cols(); ++c ) {
627  result( r, c ) = matrix( r, c ) * scalar;
628  }
629  }
630  return result;
631 }
632 
633 } // namespace utils
634 } // namespace genesis
635 
636 #endif // include guard
Matrix< T > matrix_multiplication(Matrix< A > const &a, Matrix< B > const &b)
Calculate the product of two Matrices.
std::vector< MinMaxPair< T > > matrix_row_minmax(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the row-wise min and max values of a Matrix.
Matrix< double > sums_of_squares_and_cross_products_matrix(Matrix< double > const &data)
Calculate the Sums of Squares and Cross Products Matrix (SSCP Matrix).
Store a pair of min and max values.
Definition: statistics.hpp:105
Matrix< T > matrix_sort_by_row_sum_symmetric(Matrix< T > const &data)
Sort rows and columns of a Matrix by the sum or the rows.
std::vector< MeanStddevPair > standardize_rows(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the rows of a Matrix by subtracting the mean and scaling to unit variance.
Matrix< T > matrix_subtraction(Matrix< A > const &a, Matrix< B > const &b)
Calculate the element-wise difference of two Matrices.
std::vector< T > matrix_row_sums(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the sum of each row and return the result as a vector.
double sum(const Histogram &h)
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::vector< size_t > sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the sorted order of the given range.
Definition: algorithm.hpp:182
std::vector< T > matrix_col_sums(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the sum of each column and return the result as a vector.
T matrix_sum(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the sum of all elements in a Matrix.
MinMaxPair< T > matrix_minmax(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the min and max values of a Matrix.
Definition: math/matrix.hpp:55
Matrix< double > covariance_matrix(Matrix< double > const &data)
Calculate the covariance Matrix of a given data Matrix.
std::vector< MinMaxPair< double > > normalize_cols(Matrix< double > &data)
Normalize the columns of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: math/matrix.cpp:52
std::vector< MinMaxPair< double > > normalize_rows(Matrix< double > &data)
Normalize the rows of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: math/matrix.cpp:71
Matrix< T > matrix_sort_by_col_sum_symmetric(Matrix< T > const &data)
Sort rows and columns of a Matrix by the sum or the columns.
std::vector< MinMaxPair< T > > matrix_col_minmax(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the column-wise min and max values of a Matrix.
Definition: math/matrix.hpp:99
std::vector< size_t > filter_constant_columns(utils::Matrix< double > &data, double epsilon)
Filter out columns that have nearly constant values, measured using an epsilon.
Matrix< T > matrix_addition(Matrix< A > const &a, Matrix< B > const &b)
Calculate the element-wise sum of two Matrices.
Matrix< T > matrix_sort_diagonal_symmetric(Matrix< T > const &data)
Sort a Matrix so that the highest entries are on the diagonal.
std::vector< MeanStddevPair > standardize_cols(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the columns of a Matrix by subtracting the mean and scaling to unit variance.
Definition: math/matrix.cpp:94
Matrix< double > correlation_matrix(Matrix< double > const &data)
Calculate the correlation Matrix of a given data Matrix.