A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
common.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
33 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <cmath>
39 
40 namespace genesis {
41 namespace utils {
42 
43 // =================================================================================================
44 // Statistics
45 // =================================================================================================
46 
47 MeanStddevPair mean_stddev( std::vector<double> const& vec, double epsilon )
48 {
49  // Prepare result.
50  MeanStddevPair result;
51  result.mean = 0.0;
52  result.stddev = 0.0;
53 
54  // Calculate mean.
55  for( size_t i = 0; i < vec.size(); ++i ) {
56  result.mean += vec[i];
57  }
58  result.mean /= static_cast<double>( vec.size() );
59 
60  // Calculate column std dev.
61  for( size_t i = 0; i < vec.size(); ++i ) {
62  result.stddev += (( vec[i] - result.mean ) * ( vec[i] - result.mean ));
63  }
64  result.stddev /= static_cast<double>( vec.size() );
65  result.stddev = std::sqrt( result.stddev );
66 
67  // The following in an inelegant (but usual) way to handle near-zero values,
68  // which later would cause a division by zero.
69  assert( result.stddev >= 0.0 );
70  if( result.stddev <= epsilon ){
71  result.stddev = 1.0;
72  }
73 
74  return result;
75 }
76 
85 double median( std::vector<double> const& vec, size_t l, size_t r )
86 {
87  assert( l < vec.size() && r < vec.size() && l <= r );
88 
89  // Size of the interval.
90  size_t const sz = r - l + 1;
91 
92  // Even or odd size? Median is calculated differently.
93  if( sz % 2 == 0 ) {
94 
95  // Get the two middle positions.
96  size_t pl = l + sz / 2 - 1;
97  size_t pu = l + sz / 2;
98  assert( pl < vec.size() && pu < vec.size() );
99 
100  return ( vec[pl] + vec[pu] ) / 2.0;
101 
102  } else {
103 
104  // Int division, rounds down. This is what we want.
105  size_t p = l + sz / 2;
106  assert( p < vec.size() );
107 
108  return vec[p];
109  }
110 }
111 
112 double median( std::vector<double> const& vec )
113 {
114  // Checks.
115  if( ! std::is_sorted( vec.begin(), vec.end() )) {
116  throw std::runtime_error( "Vector has to be sorted for median calculation." );
117  }
118  if( vec.size() == 0 ) {
119  return 0.0;
120  }
121 
122  // Use local helper function, which takes the range inclusively.
123  return median( vec, 0, vec.size() - 1 );
124 }
125 
126 Quartiles quartiles( std::vector<double> const& vec )
127 {
128  // Prepare result.
129  Quartiles result;
130 
131  // Checks.
132  if( ! std::is_sorted( vec.begin(), vec.end() )) {
133  throw std::runtime_error( "Vector has to be sorted for quartiles calculation." );
134  }
135  if( vec.size() == 0 ) {
136  return result;
137  }
138 
139  // Set min, 50% and max.
140  result.q0 = vec[0];
141  result.q2 = median( vec, 0, vec.size() - 1 );
142  result.q4 = vec[ vec.size() - 1 ];
143 
144  // Even or odd size? Quartiles are calculated differently.
145  // This could be done shorter, but this way feels more expressive.
146  if( vec.size() % 2 == 0 ) {
147 
148  // Even: Split exaclty in halves.
149  result.q1 = median( vec, 0, vec.size() / 2 - 1 );
150  result.q3 = median( vec, vec.size() / 2, vec.size() - 1 );
151 
152  } else {
153 
154  // Odd: Do not include the median value itself.
155  result.q1 = median( vec, 0, vec.size() / 2 - 1 );
156  result.q3 = median( vec, vec.size() / 2 + 1, vec.size() - 1 );
157  }
158 
159  return result;
160 }
161 
163  std::vector<double> const& vec_a,
164  std::vector<double> const& vec_b
165 ) {
166  if( vec_a.size() != vec_b.size() ) {
167  throw std::runtime_error( "Vectors need to have same size." );
168  }
169 
170  // Calculate column means.
171  double mean1 = 0.0;
172  double mean2 = 0.0;
173  for( size_t i = 0; i < vec_a.size(); ++i ) {
174  mean1 += vec_a[ i ];
175  mean2 += vec_b[ i ];
176  }
177  mean1 /= static_cast<double>( vec_a.size() );
178  mean2 /= static_cast<double>( vec_b.size() );
179 
180  // Calculate PCC parts.
181  double numerator = 0.0;
182  double stddev1 = 0.0;
183  double stddev2 = 0.0;
184  for( size_t i = 0; i < vec_a.size(); ++i ) {
185  double const d1 = vec_a[ i ] - mean1;
186  double const d2 = vec_b[ i ] - mean2;
187  numerator += d1 * d2;
188  stddev1 += d1 * d1;
189  stddev2 += d2 * d2;
190  }
191 
192  // Calcualte PCC, and assert that it is in the correct range
193  // (or not a number, which can happen if the std dev is 0.0, e.g. in all-zero vectors).
194  auto const pcc = numerator / ( std::sqrt( stddev1 ) * std::sqrt( stddev2 ) );
195  assert(( -1.0 <= pcc && pcc <= 1.0 ) || ( ! std::isfinite( pcc ) ));
196  return pcc;
197 }
198 
200  std::vector<double> const& vec_a,
201  std::vector<double> const& vec_b
202 ) {
203  // Get the ranking of both vectors.
204  auto const ranks_a = ranking_fractional( vec_a );
205  auto const ranks_b = ranking_fractional( vec_b );
206 
207  return pearson_correlation_coefficient( ranks_a, ranks_b );
208 }
209 
210 double fisher_transformation( double correlation_coefficient )
211 {
212  auto const r = correlation_coefficient;
213  if( r < -1.0 || r > 1.0 ) {
214  throw std::invalid_argument(
215  "Cannot apply fisher transformation to value " + std::to_string( r ) +
216  " outside of [ -1.0, 1.0 ]."
217  );
218  }
219 
220  // LOG_DBG << "formula " << 0.5 * log( ( 1.0 + r ) / ( 1.0 - r ) );
221  // LOG_DBG << "simple " << std::atanh( r );
222  return std::atanh( r );
223 }
224 
225 std::vector<double> fisher_transformation( std::vector<double> const& correlation_coefficients )
226 {
227  auto res = correlation_coefficients;
228  for( auto& elem : res ) {
229  elem = fisher_transformation( elem );
230  }
231  return res;
232 }
233 
234 // =================================================================================================
235 // Ranking
236 // =================================================================================================
237 
238 std::vector<size_t> ranking_standard( std::vector<double> const& vec )
239 {
240  // Prepare result, and get the sorting order of the vector.
241  auto result = std::vector<size_t>( vec.size(), 1 );
242  auto const order = stable_sort_indices( vec.begin(), vec.end() );
243 
244  // Shortcuts for better readability.
245  auto ordered_value = [&]( size_t i ){
246  return vec[ order[i] ];
247  };
248  auto ordered_result = [&]( size_t i ) -> size_t& {
249  return result[ order[i] ];
250  };
251 
252  // Calculate ranks.
253  for( size_t i = 1; i < vec.size(); ++i ) {
254 
255  // Same values get the same rank. The next bigger one continues at the current i.
256  if( ordered_value( i ) == ordered_value( i - 1 ) ) {
257  ordered_result( i ) = ordered_result( i - 1 );
258  } else {
259  ordered_result( i ) = i + 1;
260  }
261  }
262 
263  return result;
264 }
265 
266 std::vector<size_t> ranking_modified( std::vector<double> const& vec )
267 {
268  // Prepare result, and get the sorting order of the vector.
269  auto result = std::vector<size_t>( vec.size(), 1 );
270  auto const order = stable_sort_indices( vec.begin(), vec.end() );
271 
272  // Shortcuts for better readability.
273  auto ordered_value = [&]( size_t i ){
274  return vec[ order[i] ];
275  };
276  auto ordered_result = [&]( size_t i ) -> size_t& {
277  return result[ order[i] ];
278  };
279 
280  // Calculate ranks. The loop variable is incremented at the end.
281  for( size_t i = 0; i < vec.size(); ) {
282 
283  // Look ahead: How often does the value occur?
284  size_t j = 1;
285  while( i+j < vec.size() && ordered_value(i+j) == ordered_value(i) ) {
286  ++j;
287  }
288 
289  // Set the j-next entries.
290  for( size_t k = 0; k < j; ++k ) {
291  ordered_result( i + k ) = i + j;
292  }
293 
294  // We can skip the j-next loop iterations, as we just set their values
295  i += j;
296  }
297 
298  return result;
299 }
300 
301 std::vector<size_t> ranking_dense( std::vector<double> const& vec )
302 {
303  // Prepare result, and get the sorting order of the vector.
304  auto result = std::vector<size_t>( vec.size(), 1 );
305  auto const order = stable_sort_indices( vec.begin(), vec.end() );
306 
307  // Shortcuts for better readability.
308  auto ordered_value = [&]( size_t i ){
309  return vec[ order[i] ];
310  };
311  auto ordered_result = [&]( size_t i ) -> size_t& {
312  return result[ order[i] ];
313  };
314 
315  // Calculate ranks.
316  for( size_t i = 1; i < vec.size(); ++i ) {
317 
318  // Same values get the same rank. The next bigger one continues by incrementing.
319  if( ordered_value( i ) == ordered_value( i - 1 ) ) {
320  ordered_result( i ) = ordered_result( i - 1 );
321  } else {
322  ordered_result( i ) = ordered_result( i - 1 ) + 1;
323  }
324  }
325 
326  return result;
327 }
328 
329 std::vector<size_t> ranking_ordinal( std::vector<double> const& vec )
330 {
331  // Prepare result, and get the sorting order of the vector.
332  auto result = std::vector<size_t>( vec.size(), 1 );
333  auto const order = stable_sort_indices( vec.begin(), vec.end() );
334 
335  // Shortcuts for better readability.
336  auto ordered_result = [&]( size_t i ) -> size_t& {
337  return result[ order[i] ];
338  };
339 
340  // Calculate ranks. This is simply the order plus 1 (as ranks are 1-based).
341  for( size_t i = 0; i < vec.size(); ++i ) {
342  ordered_result( i ) = i + 1;
343  }
344 
345  return result;
346 }
347 
348 std::vector<double> ranking_fractional( std::vector<double> const& vec )
349 {
350  // Prepare result, and get the sorting order of the vector.
351  auto result = std::vector<double>( vec.size(), 1 );
352  auto const order = stable_sort_indices( vec.begin(), vec.end() );
353 
354  // Shortcuts for better readability.
355  auto ordered_value = [&]( size_t i ){
356  return vec[ order[i] ];
357  };
358  auto ordered_result = [&]( size_t i ) -> double& {
359  return result[ order[i] ];
360  };
361 
362  // Calculate the average of the sum of numbers in the given inclusive range.
363  auto sum_avg = []( size_t l, size_t r )
364  {
365  assert( l <= r );
366 
367  // Example: l == 7, r == 9
368  // We want: (7 + 8 + 9) / 3 = 8.0
369  // Upper: 1+2+3+4+5+6+7+8+9 = 45
370  // Lower: 1+2+3+4+5+6 = 21
371  // Diff: 45 - 21 = 24
372  // Count: 9 - 7 + 1 = 3
373  // Result: 24 / 3 = 8
374  auto const upper = r * ( r + 1 ) / 2;
375  auto const lower = ( l - 1 ) * l / 2;
376  return static_cast<double>( upper - lower ) / static_cast<double>( r - l + 1 );
377  };
378 
379  // Calculate ranks. The loop variable is incremented at the end.
380  for( size_t i = 0; i < vec.size(); ) {
381 
382  // Look ahead: How often does the value occur?
383  size_t j = 1;
384  while( i+j < vec.size() && ordered_value(i+j) == ordered_value(i) ) {
385  ++j;
386  }
387 
388  // Set the j-next entries.
389  auto entry = sum_avg( i + 1, i + j );
390  for( size_t k = 0; k < j; ++k ) {
391  ordered_result( i + k ) = entry;
392  }
393 
394  // We can skip the j-next loop iterations, as we just set their values
395  i += j;
396  }
397 
398  return result;
399 }
400 
401 } // namespace utils
402 } // namespace genesis
std::vector< size_t > ranking_dense(std::vector< double > const &vec)
Return the ranking of the given values, using Dense ranking ("1223" ranking).
Definition: common.cpp:301
Provides some valuable algorithms that are not part of the C++ 11 STL.
double fisher_transformation(double correlation_coefficient)
Apply Fisher z-transformation to a correlation coefficient.
Definition: common.cpp:210
double spearmans_rank_correlation_coefficient(std::vector< double > const &vec_a, std::vector< double > const &vec_b)
Calculate Spearman's Rank Correlation Coefficient between the entries of two vectors.
Definition: common.cpp:199
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
double median(std::vector< double > const &vec, size_t l, size_t r)
Local helper function to get the median in between a range.
Definition: common.cpp:85
std::vector< size_t > ranking_ordinal(std::vector< double > const &vec)
Return the ranking of the given values, using Ordinal ranking ("1234" ranking).
Definition: common.cpp:329
double pearson_correlation_coefficient(std::vector< double > const &vec_a, std::vector< double > const &vec_b)
Calculate the Pearson Correlation Coefficient between the entries of two vectors. ...
Definition: common.cpp:162
MeanStddevPair mean_stddev(std::vector< double > const &vec, double epsilon)
Calcualte the mean and standard deviation of a vector of double.
Definition: common.cpp:47
Store a mean and a standard deviation value.
Definition: common.hpp:73
Provides easy and fast logging functionality.
std::vector< size_t > stable_sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the stable sorted order of the given range.
Definition: algorithm.hpp:149
std::vector< double > ranking_fractional(std::vector< double > const &vec)
Return the ranking of the given values, using Fractional ranking ("1 2.5 2.5 4" ranking).
Definition: common.cpp:348
std::vector< size_t > ranking_standard(std::vector< double > const &vec)
Return the ranking of the given values, using Standard competition ranking ("1224" ranking)...
Definition: common.cpp:238
Quartiles quartiles(std::vector< double > const &vec)
Calculate the Quartiles of a vector of double.
Definition: common.cpp:126
std::vector< size_t > ranking_modified(std::vector< double > const &vec)
Return the ranking of the given values, using Modified competition ranking ("1334" ranking)...
Definition: common.cpp:266
Store the values of quartiles: q0 == min, q1 == 25%, q2 == 50%, q3 == 75%, q4 == max.
Definition: common.hpp:83