A toolkit for working with phylogenetic data.
v0.24.0
mds.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
31 /*
32  The implementation is based on the [SimpleMatrix library](https://sites.google.com/site/simpmatrix/)
33  by [Quan Wang](https://github.com/wq2012), with his explicit permission to use this code here.
34  The copyright (C) of the implementation is held by Quan Wang, 2013.
35  We adapted the implementation to our genesis::utils::Matrix class and changed the error reporting
36  mechanism to exceptions. For further details, see the Acknowledgements section of the documentation.
37 */
38 
40 
47 
48 #include <algorithm>
49 #include <cassert>
50 #include <cmath>
51 #include <numeric>
52 #include <stdexcept>
53 
54 namespace genesis {
55 namespace utils {
56 
57 // ================================================================================================
58 // MDS Algorithms
59 // ================================================================================================
60 
61 constexpr double MDS_EPSILON = 0.0000001;
62 
64  Matrix<double> const& distances,
65  Matrix<double> const& initial_values,
66  size_t dimensions,
67  size_t iterations
68 ) {
69  // This function is local, and we already checked the conditions below.
70  // Thus, just assert them here again.
71  assert( is_square( distances ));
72  assert( dimensions >= 1 );
73  assert( iterations >= 1 );
74  assert( initial_values.rows() == distances.rows() );
75  assert( initial_values.cols() == dimensions );
76 
77  // Algorithm constants
78  double const learning_rate = 0.05;
79  double const r_metric = 2.0;
80  size_t const n = distances.rows();
81 
82  // Resutl matrix
83  auto result = initial_values;
84  assert( result.rows() == n );
85  assert( result.cols() == dimensions );
86 
87  // Temporary matrices
88  auto dh = Matrix<double>( n, n, 0.0 );
89  auto pmat = Matrix<double>( n, dimensions, 0.0 );
90  auto dhdum = std::vector<double>( n, 0.0 );
91  auto dhmat = Matrix<double>( n - 1, dimensions, 0.0 );
92 
93  // Make a matrix with column-wise permutations of consecutive numbers.
94  auto perms = Matrix<size_t>( n, iterations );
95  for( size_t j = 0; j < perms.cols(); ++j ) {
96  std::iota( perms.col(j).begin(), perms.col(j).end(), 0 );
97  std::shuffle( perms.col(j).begin(), perms.col(j).end(), Options::get().random_engine() );
98  }
99 
100  // Run the iterations.
101  for( size_t it = 0; it < iterations; ++it ) {
102 
103  // Work on each vector in a randomly permuted order
104  for( size_t rp = 0; rp < n; ++rp ) {
105  auto const m = perms( rp, it );
106  assert( m < n );
107 
108  for( size_t i = 0; i < n; ++i ) {
109  for( size_t j = 0; j < dimensions; ++j ) {
110  pmat( i, j ) = result( m, j ) - result( i, j );
111  }
112  }
113  for( size_t i = 0; i < n; ++i ) {
114  double temp = 0.0;
115  for( size_t j = 0; j < dimensions; ++j ) {
116  temp += std::pow( std::abs( pmat( i, j )), r_metric );
117  }
118  dhdum[ i ] = std::pow( temp, 1.0 / r_metric );
119  }
120  for( size_t i = 0; i < n; ++i ) {
121  if( i == m ) {
122  continue;
123  }
124  dh( m, i ) = dhdum[ i ];
125  dh( i, m ) = dhdum[ i ];
126  }
127  for( size_t i = 0; i < n - 1; ++i ) {
128  size_t const ii = ( i < m ) ? i : i + 1;
129  double const temp
130  = learning_rate
131  * ( dhdum[ ii ] - distances( ii, m ) )
132  * std::pow( dhdum[ ii ], 1.0 - r_metric )
133  ;
134  for( size_t j = 0; j < dimensions; ++j ) {
135  dhmat( i, j ) = temp;
136  }
137  }
138  for( size_t i = 0; i < n - 1; ++i ) {
139  size_t const ii = ( i < m ) ? i : i + 1;
140  for( size_t j = 0; j < dimensions; ++j ) {
141  double temp = result( ii, j );
142  temp += dhmat( i, j )
143  * std::pow( std::abs( pmat( ii, j )), r_metric - 1.0 )
144  * signum<double>( pmat( ii, j ))
145  ;
146  result( ii, j ) = temp;
147  }
148  }
149  }
150  }
151 
152  return result;
153 }
154 
156  Matrix<double> const& distances,
157  Matrix<double> const& initial_values,
158  size_t dimensions,
159  size_t iterations
160 ) {
161  // This function is local, and we already checked the conditions below.
162  // Thus, just assert them here again.
163  assert( is_square( distances ));
164  assert( dimensions >= 1 );
165  assert( iterations >= 1 );
166  assert( initial_values.rows() == distances.rows() );
167  assert( initial_values.cols() == dimensions );
168  (void) dimensions;
169 
170  // Prepare result and a copy of it for the updating process.
171  auto result = initial_values;
172  auto previous = initial_values;
173 
174  // Get initial interpoint distances of the initial values.
175  auto idists = euclidean_distance_matrix( result );
176  assert( idists.rows() == distances.rows() );
177  assert( idists.cols() == distances.cols() );
178 
179  // Init empty stress matrix.
180  auto stress = Matrix<double>( distances.rows(), distances.cols() );
181 
182  // Run the iterations.
183  for( size_t it = 0; it < iterations; ++it ) {
184 
185  // Calcualte stress.
186  for( size_t i = 0; i < distances.rows(); ++i ) {
187  for( size_t j = 0; j < distances.cols(); ++j ) {
188  if( i == j || std::abs( idists(i,j)) < MDS_EPSILON ) {
189  stress( i, j ) = 0.0;
190  } else {
191  stress( i, j ) = - distances( i, j ) / idists( i, j );
192  }
193  }
194  }
195 
196  // Calculate stress diagonal.
197  for( size_t j = 0; j < distances.cols(); ++j ) {
198  double temp = 0.0;
199  for( size_t i = 0; i < distances.rows(); ++i ) {
200  temp += stress( i, j );
201  }
202  stress( j, j ) = - temp;
203  }
204 
205  // Update result.
206  assert( stress.rows() == result.rows() );
207  assert( previous.rows() == stress.cols() );
208  assert( previous.cols() == result.cols() );
209  for( size_t i = 0; i < result.rows(); ++i ) {
210  for( size_t j = 0; j < result.cols(); ++j ) {
211  double temp = 0.0;
212  for( size_t k = 0; k < stress.cols(); ++k ) {
213  temp += stress( i, k ) * previous( k, j );
214  }
215  result( i, j ) = temp / static_cast<double>( distances.rows() );
216  }
217  }
218 
219  // Update intermediate data.
220  idists = euclidean_distance_matrix( result );
221  previous = result;
222  }
223 
224  return result;
225 }
226 
227 // ================================================================================================
228 // MDS API Functions
229 // ================================================================================================
230 
232  Matrix<double> const& distances,
233  size_t dimensions,
234  size_t iterations,
235  MdsAlgorithm algorithm
236 ) {
237  // We skip all error checks here, because they will be done in the other function anyway.
238 
239  // Make a random init matrix in the range -0.5 to 0.5, and get the mean of the values
240  // as if they were in the range 0.0 to 1.0. We need this for proper normalization.
241  auto initial = Matrix<double>( distances.rows(), dimensions );
242  auto& engine = Options::get().random_engine();
243  auto distrib = std::uniform_real_distribution<double>( 0.0, 1.0 );
244  double mean = 0.0;
245  for( auto& e : initial ) {
246  auto const r = distrib( engine );
247  mean += r;
248  e = r - 0.5;
249  }
250  mean /= initial.size();
251  assert( 0.0 <= mean && mean <= 1.0 );
252 
253  // Normalize using the mean.
254  for( auto& e : initial ) {
255  e *= 0.1 * mean / ( 1.0 / 3.0 * std::sqrt( static_cast<double>( dimensions )));
256  }
257 
258  // Run the algo.
259  return multi_dimensional_scaling( distances, initial, dimensions, iterations, algorithm );
260 }
261 
263  Matrix<double> const& distances,
264  Matrix<double> const& initial_values,
265  size_t dimensions,
266  size_t iterations,
267  MdsAlgorithm algorithm
268 ) {
269  if( ! is_square( distances )) {
270  throw std::invalid_argument( "MDS input distance matrix is not square." );
271  }
272  if( dimensions < 1 ) {
273  throw std::invalid_argument( "MDS dimensions has to be >= 1." );
274  }
275  if( iterations < 1 ) {
276  throw std::invalid_argument( "MDS number of iterations has to be >= 1." );
277  }
278  if( initial_values.rows() != distances.rows() || initial_values.cols() != dimensions ) {
279  throw std::invalid_argument( "MDS initial values matrix has invalid dimensions." );
280  }
281  if( distances.empty() ) {
282  return {};
283  }
284 
285  switch( algorithm ) {
286  case MdsAlgorithm::kUcf: {
288  distances, initial_values, dimensions, iterations
289  );
290  }
291  case MdsAlgorithm::kSmacof: {
293  distances, initial_values, dimensions, iterations
294  );
295  }
296  default: {
297  assert( false );
298  }
299  }
300  return {};
301 }
302 
303 } // namespace utils
304 } // namespace genesis
Provides some valuable algorithms that are not part of the C++ 11 STL.
static Matrix< double > multi_dimensional_scaling_ucf(Matrix< double > const &distances, Matrix< double > const &initial_values, size_t dimensions, size_t iterations)
Definition: mds.cpp:63
Use the UCF implementation (recommended).
constexpr double MDS_EPSILON
Definition: mds.cpp:61
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
bool is_square(Matrix< T > const &data)
Return whether a Matrix is a square matrix, that is, whether its number of rows and number of columns...
static Matrix< double > multi_dimensional_scaling_smacof(Matrix< double > const &distances, Matrix< double > const &initial_values, size_t dimensions, size_t iterations)
Definition: mds.cpp:155
Matrix operators.
double mean(const Histogram &h)
Compute the bin-weighted arithmetic mean.
MatrixCol< self_type, value_type > col(size_t col)
MdsAlgorithm
Choice of algorithm to use for Multi-Dimensional Scaling (MDS).
Definition: mds.hpp:52
Matrix< double > euclidean_distance_matrix(Matrix< double > const &data)
Calculate the pairwise euclidean distance matrix between the rows of a given matrix.
Definition: distance.cpp:110
Matrix< double > multi_dimensional_scaling(Matrix< double > const &distances, size_t dimensions, size_t iterations, MdsAlgorithm algorithm)
Multi-Dimensional Scaling (MDS).
Definition: mds.cpp:231
std::default_random_engine & random_engine()
Returns the default engine for random number generation.
Definition: options.hpp:158
Use the SMACOF implementation.
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:60