A library for working with phylogenetic and population genetic data.
v0.32.0
correlation.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cmath>
38 #include <iostream>
39 #include <limits>
40 #include <numeric>
41 #include <unordered_map>
42 #include <vector>
43 
44 namespace genesis {
45 namespace utils {
46 
47 // =================================================================================================
48 // General Helper Functions
49 // =================================================================================================
50 
56  std::vector<double> const& x,
57  std::vector<double> const& y
58 ) {
59  // Collect all unique values, counting how often each of them occurs.
60  // We need to skip nan values here, as we also omit them in all other calculations.
61  std::unordered_map<double, size_t> unique_x;
62  std::unordered_map<double, size_t> unique_y;
63  assert( x.size() == y.size() );
64  for( size_t i = 0; i < x.size(); ++i ) {
65  if( std::isfinite(x[i]) && std::isfinite(y[i]) ) {
66  ++unique_x[x[i]];
67  ++unique_y[y[i]];
68  }
69  }
70  return std::min( unique_x.size(), unique_y.size() );
71 }
72 
81  std::vector<double> const& x, std::vector<double> const& y,
82  size_t const concordant, size_t const discordant,
83  size_t const n, size_t const n0, size_t const n1, size_t const n2, size_t const n3,
84  KendallsTauMethod method
85 ) {
86  double tau = std::numeric_limits<double>::quiet_NaN();
87  assert( x.size() == y.size() );
88 
89  // All invariants of the process. The last one is the important one,
90  // where all pairs need to be accounted for.
91  assert( n0 == n * (n - 1) / 2 );
92  assert( n1 <= n0 && n2 <= n0 );
93  assert( n3 <= n1 && n3 <= n2 );
94  assert( concordant <= n0 );
95  assert( discordant <= n0 );
96  assert( n0 == concordant + discordant + n1 + n2 - n3 );
97  (void) n3; // Only used for the assertion here.
98 
99  // Compute the numerator, common to all tau methods.
100  // We first cast to signed int, so that the difference computation is exact,
101  // and then cast the results to double, as we are doing a division next.
102  double const num = static_cast<int64_t>(concordant) - static_cast<int64_t>(discordant);
103 
104  switch( method ) {
106  double const den = concordant + discordant;
107  if( den != 0.0 ) {
108  tau = num / den;
109  }
110  break;
111  }
112 
114  // Compute the Tau-b denominator via differences in ints, so that they are exact,
115  // but then convert to double so that the multiplication does not overflow.
116  double const den = std::sqrt(
117  static_cast<double>(n0 - n1) * static_cast<double>(n0 - n2)
118  );
119  if( std::isfinite(den) && den != 0.0 ) {
120  tau = num / den;
121  }
122  break;
123  }
124 
126  // Minimum of the number of unique values in x and y.
127  double const m = kendalls_tau_count_tau_c_m_( x, y );
128  double const den = squared(n) * ( m - 1 ) / m;
129  if( std::isfinite(den) && den != 0.0 && m > 0 ) {
130  tau = 2.0 * num / den;
131  }
132  break;
133  }
134 
135  default: {
136  throw std::invalid_argument( "Invalid value for KendallsTauMethod" );
137  }
138  }
139 
140  assert( !std::isfinite(tau) || ( -1.0 <= tau && tau <= 1.0 ));
141  return tau;
142 }
143 
144 // =================================================================================================
145 // Kendall Tau using Knight's Algorithm
146 // =================================================================================================
147 
153  std::vector<double>& data, std::vector<double>& temp,
154  size_t left, size_t mid, size_t right
155 ) {
156  size_t i = left;
157  size_t j = mid;
158  size_t k = left;
159  size_t inversions = 0;
160 
161  // Merge sort, counting inversions (as if we were doing bubble sort)
162  while( i < mid && j <= right ) {
163  if( data[i] <= data[j] ) {
164  temp[k++] = data[i++];
165  } else {
166  temp[k++] = data[j++];
167  inversions += mid - i;
168  }
169  }
170 
171  // Copy the remaining elements
172  while( i < mid ) {
173  temp[k++] = data[i++];
174  }
175  while( j <= right ) {
176  temp[k++] = data[j++];
177  }
178 
179  // Copy back to the original vector
180  for( i = left; i <= right; ++i ) {
181  data[i] = temp[i];
182  }
183 
184  return inversions;
185 }
186 
192  std::vector<double>& data, std::vector<double>& temp, size_t left, size_t right
193 ) {
194  if( left >= right ) {
195  return 0;
196  }
197 
198  // Count the number of inversions done by merge sorting the list.
199  size_t const mid = left + (right - left) / 2;
200  size_t const inv_l = kendalls_tau_sort_and_count_( data, temp, left, mid );
201  size_t const inv_r = kendalls_tau_sort_and_count_( data, temp, mid + 1, right );
202  size_t const inv_m = kendalls_tau_merge_count_( data, temp, left, mid + 1, right );
203 
204  return inv_l + inv_r + inv_m;
205 }
206 
210 struct kendalls_tau_pair_hash_ {
211  // Simple hash for non-pair types
212  template <class T>
213  std::size_t operator () ( T const& value ) const {
214  return std::hash<T>{}(value);
215  }
216 
217  // Hash for pairs of other types
218  template <class T1, class T2>
219  std::size_t operator () ( std::pair<T1,T2> const& value ) const {
220  auto h1 = std::hash<T1>{}(value.first);
221  auto h2 = std::hash<T2>{}(value.second);
222  return combine_hashes( h1, h2 );
223  }
224 };
225 
229 template<typename T>
231  std::vector<T> const& values
232 ) {
233  // Collect all unique values, counting how often each of them occurs.
234  std::unordered_map<T, size_t, kendalls_tau_pair_hash_> unique_counts;
235  for( auto const& val : values ) {
236  ++unique_counts[val];
237  }
238 
239  // The number of ties for the purposes of the algorithm needs to account for the duplicates
240  // occurring in all combinations of pairs, so we use a triangular number.
241  size_t tie_sum = 0;
242  for( auto const& pair : unique_counts ) {
243  if( pair.second > 1 ) {
244  tie_sum += pair.second * (pair.second - 1) / 2;
245  }
246  }
247  return tie_sum;
248 }
249 
255  std::vector<double> const& values
256 ) {
257  assert( values.size() > 1 );
258 
259  // Find runs of equal numbers, and sum up the number of tied pairs that these correspond to.
260  size_t tie_sum = 0;
261  double cur_val = values[0];
262  size_t cur_cnt = 0;
263  for( size_t i = 0; i < values.size(); ++i ) {
264  assert( std::isfinite( values[i] ));
265  if( values[i] != cur_val ) {
266  assert( values[i] > cur_val );
267 
268  // We finished a range of equal numbers (or are at the end of the iteration).
269  // The number of ties for the purposes of the algorithm needs to account for the duplicates
270  // occurring in all combinations of pairs of tied numbers, so we use a triangular number.
271  tie_sum += cur_cnt * (cur_cnt - 1) / 2;
272 
273  // Reset count for the next value.
274  cur_val = values[i];
275  cur_cnt = 1;
276  } else {
277  // Otherwise, we are still in a range of equal numbers, so keep incrementing the counter.
278  ++cur_cnt;
279  }
280  }
281 
282  // We need a last addition for the last range of numbers.
283  tie_sum += cur_cnt * (cur_cnt - 1) / 2;
284  return tie_sum;
285 }
286 
292  std::vector<double> const& values,
293  std::vector<size_t> const& ranks
294 ) {
295  assert( values.size() == ranks.size() );
296  assert( values.size() > 1 );
297 
298  // Same algorithm as kendalls_tau_count_ties_sorted_(), see there for details.
299  // We unfortunatley have this bit of code duplication, as we here need to access the values
300  // through the sorting order given by the ranks. Doing that in a template or something would
301  // mean more levels of indication and all - and as this function was written as an optimization,
302  // that would kinda defy the purpose... So, code duplication it is.
303  size_t tie_sum = 0;
304  double cur_val = values[ranks[0]];
305  size_t cur_cnt = 0;
306  for( size_t i = 0; i < values.size(); ++i ) {
307  assert( std::isfinite( values[ranks[i]] ));
308  if( values[ranks[i]] != cur_val ) {
309  assert( values[ranks[i]] > cur_val );\
310  tie_sum += cur_cnt * (cur_cnt - 1) / 2;
311  cur_val = values[ranks[i]];
312  cur_cnt = 1;
313  } else {
314  ++cur_cnt;
315  }
316  }
317  tie_sum += cur_cnt * (cur_cnt - 1) / 2;
318  return tie_sum;
319 }
320 
326  std::vector<double> const& x,
327  std::vector<double> const& y,
328  std::vector<size_t> const& ranks
329 ) {
330  assert( x.size() == y.size() );
331  assert( x.size() == ranks.size() );
332  assert( x.size() > 1 );
333 
334  // Same algorithm again, see kendalls_tau_count_ties_sorted_rank_().
335  size_t tie_sum = 0;
336  double cur_val_x = x[ranks[0]];
337  double cur_val_y = y[ranks[0]];
338  size_t cur_cnt = 0;
339  for( size_t i = 0; i < x.size(); ++i ) {
340  assert( std::isfinite( x[ranks[i]] ));
341  assert( std::isfinite( y[ranks[i]] ));
342  if( x[ranks[i]] != cur_val_x || y[ranks[i]] != cur_val_y ) {
343  assert( x[ranks[i]] > cur_val_x || ( x[ranks[i]] == cur_val_x && y[ranks[i]] > cur_val_y ));
344  tie_sum += cur_cnt * (cur_cnt - 1) / 2;
345  cur_val_x = x[ranks[i]];
346  cur_val_y = y[ranks[i]];
347  cur_cnt = 1;
348  } else {
349  ++cur_cnt;
350  }
351  }
352  tie_sum += cur_cnt * (cur_cnt - 1) / 2;
353  return tie_sum;
354 }
355 
360  std::vector<double> const& x,
361  std::vector<double> const& y,
362  KendallsTauMethod method
363 ) {
364  // Basic checks.
365  assert( x.size() == y.size() );
366  if( x.size() < 2 ) {
367  return std::numeric_limits<double>::quiet_NaN();
368  }
369 
370  // We only count the discordant pairs as the number of inversions made in the merge sort above.
371  // To get the correct number of concordant pairs, we need to know the ties in x (called n1),
372  // the number of ties in y (called n2), and the number of ties in x _and_ y, called n3.
373  // We calculate all of them at different stages of this function, making use of the fact
374  // that our data is sorted by x and by y at points.
375 
376  // Create a vector of indices sorted by the corresponding values in x,
377  // breaking ties in x by secondary sort on y.
378  size_t const n = x.size();
379  std::vector<size_t> rank_x(n);
380  std::iota( rank_x.begin(), rank_x.end(), 0 );
381  std::sort( rank_x.begin(), rank_x.end(),
382  [&]( size_t i, size_t j ) {
383  return x[i] == x[j] ? y[i] < y[j] : x[i] < x[j];
384  }
385  );
386 
387  // The above raking means we have a sorting of x, which also serves as a sorting of pairs!
388  // We use this to compute n1 and n3 here.
389  auto const n1 = kendalls_tau_count_ties_sorted_rank_( x, rank_x );
390  auto const n3 = kendalls_tau_count_ties_sorted_pairs_rank_( x, y, rank_x );
391 
392  // Create a vector of y values sorted according to x
393  std::vector<double> sorted_y(n);
394  for( size_t i = 0; i < n; ++i ) {
395  sorted_y[i] = y[rank_x[i]];
396  }
397  rank_x.clear();
398 
399  // Use merge sort to count inversions in sorted_y, which are discordant pairs.
400  // We use a temporary vector for merge sort, to avoid re-allocating memory in each step.
401  std::vector<double> temp(n);
402  size_t const discordant = kendalls_tau_sort_and_count_( sorted_y, temp, 0, n - 1 );
403  assert( std::is_sorted( sorted_y.begin(), sorted_y.end() ));
404  temp.clear();
405 
406  // Now we have the list sorted by y, which we can use to compute n2.
407  auto const n2 = kendalls_tau_count_ties_sorted_( sorted_y );
408  sorted_y.clear();
409 
410  // We also compute n0 = total number of pairs.
411  size_t const n0 = n * (n - 1) / 2;
412  assert( n0 >= n1 && n0 >= n2 );
413 
414  // Now we can compute the number of concordant pairs.
415  size_t const concordant = n0 - n1 - n2 + n3 - discordant;
416  assert( concordant <= n0 );
417  assert( discordant <= n0 );
418 
419  // Compute the final value, using corrections as needed.
420  return kendalls_tau_method_( x, y, concordant, discordant, n, n0, n1, n2, n3, method );
421 }
422 
424  std::vector<double> const& x,
425  std::vector<double> const& y,
426  KendallsTauMethod method
427 ) {
428  // Errors and boundary cases
429  if( x.size() != y.size() ) {
430  throw std::invalid_argument(
431  "kendalls_tau_correlation_coefficient: Input with differing numbers of elements."
432  );
433  }
434 
435  // In the presence of nan values, we make a copy of the values, omitting pairs with nans.
436  // Maybe there is a way to avoid this, but Knight's algorithm uses rank sorting, which gets
437  // so much more complicated when values need to be skipped/masked... So for now, this is what
438  // we do for simplicity. At least, in the case without nan values, we can avoid the copy.
439  size_t nan_count = 0;
440  for( size_t i = 0; i < x.size(); ++i ) {
441  if( !std::isfinite(x[i]) || !std::isfinite(y[i]) ) {
442  ++nan_count;
443  }
444  }
445  if( nan_count > 0 ) {
446  assert( nan_count <= x.size() );
447  std::vector<double> x_clean;
448  std::vector<double> y_clean;
449  x_clean.reserve( x.size() - nan_count );
450  y_clean.reserve( x.size() - nan_count );
451  for( size_t i = 0; i < x.size(); ++i ) {
452  if( std::isfinite(x[i]) && std::isfinite(y[i]) ) {
453  x_clean.push_back(x[i]);
454  y_clean.push_back(y[i]);
455  }
456  }
457 
458  return kendalls_tau_correlation_coefficient_clean_( x_clean, y_clean, method );
459  } else {
460  return kendalls_tau_correlation_coefficient_clean_( x, y, method );
461  }
462 }
463 
464 // =================================================================================================
465 // Kendall Tau Naive Algorithm
466 // =================================================================================================
467 
469  std::vector<double> const& x,
470  std::vector<double> const& y,
471  KendallsTauMethod method
472 ) {
473  // Boundary checks.
474  if( x.size() != y.size() ) {
475  throw std::invalid_argument(
476  "kendalls_tau_correlation_coefficient: Input with differing numbers of elements."
477  );
478  }
479 
480  // Count number of pairs for the numerator.
481  size_t concordant = 0;
482  size_t discordant = 0;
483 
484  // n0 = total number of pairs, n1 = ties in x, n2 = ties in y, n3 = ties in both
485  size_t n0 = 0;
486  size_t n1 = 0;
487  size_t n2 = 0;
488  size_t n3 = 0;
489 
490  // We also count n, the size of the list that was used, i.e. excluding pairs that have nans.
491  size_t n = 0;
492 
493  // Iterate through all pairs of indices and accumulate concordant and discordant pairs.
494  for( size_t i = 0; i < x.size(); ++i ) {
495  if( std::isfinite(x[i]) && std::isfinite(y[i]) ) {
496  ++n;
497  } else {
498  continue;
499  }
500 
501  for( size_t j = i + 1; j < x.size(); ++j ) {
502  // Skip any pair with non-finite values.
503  if( !std::isfinite(x[j]) || !std::isfinite(y[j]) ) {
504  continue;
505  }
506 
507  // Get pair ordering.
508  double const dx = x[i] - x[j];
509  double const dy = y[i] - y[j];
510  assert( std::isfinite(dx) && std::isfinite(dy) );
511  ++n0;
512 
513  // Count concordances and ties.
514  if( dx == 0 && dy == 0 ) {
515  // Both pairs are ties
516  ++n1;
517  ++n2;
518  ++n3;
519  } else if( dx == 0 ) {
520  ++n1;
521  } else if( dy == 0 ) {
522  ++n2;
523  } else if( dx * dy > 0 ) {
524  ++concordant;
525  } else if( dx * dy < 0 ) {
526  ++discordant;
527  } else {
528  // We have exhausted all cases that can occur with finite values.
529  assert( false );
530  }
531  }
532  }
533 
534  // Compute the final value, using corrections as needed.
535  return kendalls_tau_method_( x, y, concordant, discordant, n, n0, n1, n2, n3, method );
536 }
537 
538 } // namespace utils
539 } // namespace genesis
genesis::utils::kendalls_tau_method_
double kendalls_tau_method_(std::vector< double > const &x, std::vector< double > const &y, size_t const concordant, size_t const discordant, size_t const n, size_t const n0, size_t const n1, size_t const n2, size_t const n3, KendallsTauMethod method)
Given counts of concordant and discordant pairs, compute the final value, applying the requested adju...
Definition: correlation.cpp:80
genesis::utils::combine_hashes
constexpr std::size_t combine_hashes(std::size_t h1, std::size_t h2)
Combine two hash values.
Definition: std.hpp:127
genesis::utils::kendalls_tau_merge_count_
size_t kendalls_tau_merge_count_(std::vector< double > &data, std::vector< double > &temp, size_t left, size_t mid, size_t right)
Helper function for kendalls_tau_correlation_coefficient() to do a merge sort that counts the number ...
Definition: correlation.cpp:152
common.hpp
genesis::utils::kendalls_tau_count_ties_sorted_rank_
size_t kendalls_tau_count_ties_sorted_rank_(std::vector< double > const &values, std::vector< size_t > const &ranks)
Helper function to count the number of tied pairs induced by equal values, on a sorted input list wit...
Definition: correlation.cpp:291
std.hpp
Provides some valuable additions to STD.
genesis::utils::kendalls_tau_count_ties_
size_t kendalls_tau_count_ties_(std::vector< T > const &values)
Helper function to count the number of tied pairs induced by equal values.
Definition: correlation.cpp:230
genesis::utils::KendallsTauMethod::kTauC
@ kTauC
Compute Tau-c, (also called Stuart-Kendall Tau-c).
genesis::utils::kendalls_tau_count_ties_sorted_pairs_rank_
size_t kendalls_tau_count_ties_sorted_pairs_rank_(std::vector< double > const &x, std::vector< double > const &y, std::vector< size_t > const &ranks)
Helper function to count the number of tied pairs induced by equal values, on two input list with the...
Definition: correlation.cpp:325
genesis::utils::kendalls_tau_correlation_coefficient_naive
double kendalls_tau_correlation_coefficient_naive(std::vector< double > const &x, std::vector< double > const &y, KendallsTauMethod method)
Compute a simple version of Kendall's Tau Correlation Coefficient.
Definition: correlation.cpp:468
genesis::utils::KendallsTauMethod::kTauA
@ kTauA
Compute Tau-a, which does not make any adjustment for ties.
genesis::utils::kendalls_tau_sort_and_count_
size_t kendalls_tau_sort_and_count_(std::vector< double > &data, std::vector< double > &temp, size_t left, size_t right)
Helper function for kendalls_tau_correlation_coefficient() to sort a list using merge sort,...
Definition: correlation.cpp:191
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::KendallsTauMethod
KendallsTauMethod
Method for computing Kendall's Tau.
Definition: correlation.hpp:187
genesis::utils::kendalls_tau_correlation_coefficient_clean_
double kendalls_tau_correlation_coefficient_clean_(std::vector< double > const &x, std::vector< double > const &y, KendallsTauMethod method)
Compute Kendall's Tau, expecting clean input without nan values, using Knight's algorithm.
Definition: correlation.cpp:359
genesis::utils::kendalls_tau_count_tau_c_m_
size_t kendalls_tau_count_tau_c_m_(std::vector< double > const &x, std::vector< double > const &y)
Helper function to count the number of unique values in both lists, using only those entris that are ...
Definition: correlation.cpp:55
genesis::utils::kendalls_tau_count_ties_sorted_
size_t kendalls_tau_count_ties_sorted_(std::vector< double > const &values)
Helper function to count the number of tied pairs induced by equal values, on a sorted input list.
Definition: correlation.cpp:254
correlation.hpp
genesis::utils::kendalls_tau_correlation_coefficient
double kendalls_tau_correlation_coefficient(std::vector< double > const &x, std::vector< double > const &y, KendallsTauMethod method)
Compute Kendall's Tau Correlation Coefficient.
Definition: correlation.cpp:423
genesis::utils::KendallsTauMethod::kTauB
@ kTauB
Compute Tau-b, which does adjustments for ties.
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:223