A toolkit for working with phylogenetic data.
v0.24.0
math/matrix.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <cmath>
39 #include <stdexcept>
40 
41 #ifdef GENESIS_OPENMP
42 # include <omp.h>
43 #endif
44 
45 namespace genesis {
46 namespace utils {
47 
48 // ================================================================================================
49 // Normalize
50 // ================================================================================================
51 
52 std::vector<MinMaxPair<double>> normalize_cols( Matrix<double>& data )
53 {
54  auto col_minmax = matrix_col_minmax( data );
55  assert( col_minmax.size() == data.cols() );
56 
57  // Iterate the matrix.
58  #pragma omp parallel for
59  for( size_t r = 0; r < data.rows(); ++r ) {
60  for( size_t c = 0; c < data.cols(); ++c ) {
61  // Adjust column values.
62  assert( col_minmax[c].max >= col_minmax[c].min );
63  double diff = col_minmax[c].max - col_minmax[c].min;
64  data( r, c ) = ( data( r, c ) - col_minmax[c].min ) / diff;
65  }
66  }
67 
68  return col_minmax;
69 }
70 
71 std::vector<MinMaxPair<double>> normalize_rows( Matrix<double>& data )
72 {
73  auto row_minmax = matrix_row_minmax( data );
74  assert( row_minmax.size() == data.rows() );
75 
76  // Iterate the matrix.
77  #pragma omp parallel for
78  for( size_t r = 0; r < data.rows(); ++r ) {
79  for( size_t c = 0; c < data.cols(); ++c ) {
80  // Adjust row values.
81  assert( row_minmax[r].max >= row_minmax[r].min );
82  double diff = row_minmax[r].max - row_minmax[r].min;
83  data( r, c ) = ( data( r, c ) - row_minmax[r].min ) / diff;
84  }
85  }
86 
87  return row_minmax;
88 }
89 
90 // ================================================================================================
91 // Standardize
92 // ================================================================================================
93 
94 std::vector<MeanStddevPair> standardize_cols(
95  Matrix<double>& data,
96  bool scale_means,
97  bool scale_std
98 ) {
99  auto col_mean_stddev = std::vector<MeanStddevPair>( data.cols() );
100 
101  // Iterate the matrix.
102  #pragma omp parallel for
103  for( size_t c = 0; c < data.cols(); ++c ) {
104  col_mean_stddev[c] = mean_stddev( data.col(c).begin(), data.col(c).end(), 0.0000001 );
105 
106  for( size_t r = 0; r < data.rows(); ++r ) {
107 
108  // Subtract mean (i.e., center data).
109  if( scale_means ) {
110  data( r, c ) -= col_mean_stddev[c].mean;
111  }
112 
113  // Scale to unit variance, if needed.
114  if( scale_std ) {
115  assert( col_mean_stddev[c].stddev > 0.0 );
116  data( r, c ) /= col_mean_stddev[c].stddev;
117  }
118  }
119  }
120 
121  return col_mean_stddev;
122 }
123 
124 std::vector<MeanStddevPair> standardize_rows(
125  Matrix<double>& data,
126  bool scale_means,
127  bool scale_std
128 ) {
129  auto row_mean_stddev = std::vector<MeanStddevPair>( data.rows() );
130 
131  // Iterate the matrix.
132  #pragma omp parallel for
133  for( size_t r = 0; r < data.rows(); ++r ) {
134  row_mean_stddev[r] = mean_stddev( data.row(r).begin(), data.row(r).end(), 0.0000001 );
135 
136  for( size_t c = 0; c < data.cols(); ++c ) {
137 
138  // Subtract mean (i.e., center data).
139  if( scale_means ) {
140  data( r, c ) -= row_mean_stddev[r].mean;
141  }
142 
143  // Scale to unit variance, if needed.
144  if( scale_std ) {
145  assert( row_mean_stddev[r].stddev > 0.0 );
146  data( r, c ) /= row_mean_stddev[r].stddev;
147  }
148  }
149  }
150 
151  return row_mean_stddev;
152 }
153 
154 // =================================================================================================
155 // Filter Constant Columns
156 // =================================================================================================
157 
158 std::vector<size_t> filter_constant_columns( utils::Matrix<double>& data, double epsilon )
159 {
160  // Get the column-wise min and max values.
161  auto const col_minmax = utils::matrix_col_minmax( data );
162 
163  // Store which columns to keep, by index.
164  std::vector<size_t> keep_cols;
165  for( size_t c = 0; c < data.cols(); ++c ) {
166  // Non finite columns are left out in any case.
167  if( ! std::isfinite( col_minmax[c].min ) || ! std::isfinite( col_minmax[c].max )) {
168  continue;
169  }
170 
171  assert( col_minmax[c].min <= col_minmax[c].max );
172  if (( col_minmax[c].max - col_minmax[c].min ) > epsilon ) {
173  keep_cols.push_back( c );
174  }
175  }
176  assert( keep_cols.size() <= data.cols() );
177 
178  // Produce new, filtered matrix.
179  auto new_mat = utils::Matrix<double>( data.rows(), keep_cols.size() );
180 
181  #pragma omp parallel for
182  for( size_t r = 0; r < data.rows(); ++r ) {
183  for( size_t i = 0; i < keep_cols.size(); ++i ) {
184  new_mat( r, i ) = data( r, keep_cols[i] );
185  }
186  }
187 
188  // Overwrite the matrix.
189  data = new_mat;
190  return keep_cols;
191 }
192 
193 // =================================================================================================
194 // Correlation and Covariance
195 // =================================================================================================
196 
198 {
199  // Standardize mean and variance.
200  auto stddata = data;
201  standardize_cols( stddata, true, true );
202 
203  // Calculate matrix. First build the sum of squares, then normalize.
204  auto sscp = sums_of_squares_and_cross_products_matrix( stddata );
205  for( auto& elem : sscp ) {
206  elem /= static_cast<double>( data.rows() );
207  }
208  return sscp;
209 }
210 
212 {
213  // Standardize mean, but not the variance.
214  auto stddata = data;
215  standardize_cols( stddata, true, false );
216 
217  // Calculate matrix. First build the sum of squares, then normalize.
218  auto sscp = sums_of_squares_and_cross_products_matrix( stddata );
219  for( auto& elem : sscp ) {
220  elem /= static_cast<double>( data.rows() );
221  }
222  return sscp;
223 }
224 
226 {
227  auto mat = Matrix<double>( data.cols(), data.cols() );
228 
229  // Calculate the covariance matrix.
230  for( size_t c1 = 0; c1 < data.cols(); ++c1 ) {
231  for( size_t c2 = c1; c2 < data.cols(); ++c2 ) {
232  mat( c1, c2 ) = 0.0;
233  for( size_t r = 0; r < data.rows(); ++r ) {
234  mat( c1, c2 ) += data( r, c1 ) * data( r, c2 );
235  }
236  mat( c2, c1 ) = mat( c1, c2 );
237  }
238  }
239 
240  return mat;
241 }
242 
243 } // namespace utils
244 } // namespace genesis
std::vector< MinMaxPair< T > > matrix_row_minmax(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the row-wise min and max values of a Matrix.
Matrix< double > sums_of_squares_and_cross_products_matrix(Matrix< double > const &data)
Calculate the Sums of Squares and Cross Products Matrix (SSCP Matrix).
std::vector< MeanStddevPair > standardize_rows(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the rows of a Matrix by subtracting the mean and scaling to unit variance.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
MatrixCol< self_type, value_type > col(size_t col)
MatrixRow< self_type, value_type > row(size_t row)
Matrix< double > covariance_matrix(Matrix< double > const &data)
Calculate the covariance Matrix of a given data Matrix.
std::vector< MinMaxPair< double > > normalize_cols(Matrix< double > &data)
Normalize the columns of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: math/matrix.cpp:52
std::vector< MinMaxPair< double > > normalize_rows(Matrix< double > &data)
Normalize the rows of a Matrix so that all values are in the range [ 0.0, 1.0 ].
Definition: math/matrix.cpp:71
std::vector< MinMaxPair< T > > matrix_col_minmax(Matrix< T > const &data, bool ignore_non_finite_values=true)
Calculate the column-wise min and max values of a Matrix.
Definition: math/matrix.hpp:99
std::vector< size_t > filter_constant_columns(utils::Matrix< double > &data, double epsilon)
Filter out columns that have nearly constant values, measured using an epsilon.
std::vector< MeanStddevPair > standardize_cols(Matrix< double > &data, bool scale_means, bool scale_std)
Standardize the columns of a Matrix by subtracting the mean and scaling to unit variance.
Definition: math/matrix.cpp:94
Matrix< double > correlation_matrix(Matrix< double > const &data)
Calculate the correlation Matrix of a given data Matrix.
MeanStddevPair mean_stddev(ForwardIterator first, ForwardIterator last, double epsilon=-1.0)
Calculate the arithmetic mean and standard deviation of a range of double elements.
Definition: statistics.hpp:358