A library for working with phylogenetic and population genetic data.
v0.32.0
correlation.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_MATH_CORRELATION_H_
2 #define GENESIS_UTILS_MATH_CORRELATION_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cmath>
41 #include <cstddef>
42 #include <cstdint>
43 #include <functional>
44 #include <limits>
45 #include <stdexcept>
46 #include <string>
47 #include <utility>
48 #include <vector>
49 
50 namespace genesis {
51 namespace utils {
52 
53 // =================================================================================================
54 // Pearson Correlation Coefficient
55 // =================================================================================================
56 
57 
70 template <class ForwardIteratorA, class ForwardIteratorB>
72  ForwardIteratorA first_a, ForwardIteratorA last_a,
73  ForwardIteratorB first_b, ForwardIteratorB last_b
74 ) {
75  // Calculate means.
76  double mean_a = 0.0;
77  double mean_b = 0.0;
78  size_t count = 0;
80  first_a, last_a,
81  first_b, last_b,
82  [&]( double val_a, double val_b ){
83  mean_a += val_a;
84  mean_b += val_b;
85  ++count;
86  }
87  );
88  if( count == 0 ) {
89  return std::numeric_limits<double>::quiet_NaN();
90  }
91  assert( count > 0 );
92  mean_a /= static_cast<double>( count );
93  mean_b /= static_cast<double>( count );
94 
95  // Calculate PCC parts.
96  double numerator = 0.0;
97  double std_dev_a = 0.0;
98  double std_dev_b = 0.0;
100  first_a, last_a,
101  first_b, last_b,
102  [&]( double val_a, double val_b ){
103  double const d1 = val_a - mean_a;
104  double const d2 = val_b - mean_b;
105  numerator += d1 * d2;
106  std_dev_a += d1 * d1;
107  std_dev_b += d2 * d2;
108  }
109  );
110 
111  // Calculate PCC, and assert that it is in the correct range
112  // (or not a number, which can happen if the std dev is 0.0, e.g. in all-zero vectors).
113  auto const pcc = numerator / ( std::sqrt( std_dev_a ) * std::sqrt( std_dev_b ) );
114  assert(( -1.0 <= pcc && pcc <= 1.0 ) || ( ! std::isfinite( pcc ) ));
115  return pcc;
116 }
117 
124  std::vector<double> const& vec_a,
125  std::vector<double> const& vec_b
126 ) {
128  vec_a.begin(), vec_a.end(), vec_b.begin(), vec_b.end()
129  );
130 }
131 
132 // =================================================================================================
133 // Spearman's Correlation Coefficient
134 // =================================================================================================
135 
144 template <class RandomAccessIteratorA, class RandomAccessIteratorB>
146  RandomAccessIteratorA first_a, RandomAccessIteratorA last_a,
147  RandomAccessIteratorB first_b, RandomAccessIteratorB last_b
148 ) {
149  // Get cleaned results. We need to make these copies, as we need to calculate the fractional
150  // ranking on them, which would change if we used our normal for_each_finite_pair here...
151  auto const cleaned = finite_pairs( first_a, last_a, first_b, last_b );
152 
153  // Get the ranking of both vectors.
154  auto const ranks_a = ranking_fractional( cleaned.first );
155  auto const ranks_b = ranking_fractional( cleaned.second );
156  assert( ranks_a.size() == ranks_b.size() );
157 
158  return pearson_correlation_coefficient( ranks_a, ranks_b );
159 }
160 
167  std::vector<double> const& vec_a,
168  std::vector<double> const& vec_b
169 ) {
171  vec_a.begin(), vec_a.end(), vec_b.begin(), vec_b.end()
172  );
173 }
174 
175 // =================================================================================================
176 // Kendall Tau Correlation Coefficient
177 // =================================================================================================
178 
188 {
192  kTauA,
193 
197  kTauB,
198 
202  kTauC,
203 };
204 
218  std::vector<double> const& x,
219  std::vector<double> const& y,
221 );
222 
228 template <class InputIteratorA, class InputIteratorB>
230  InputIteratorA first_a, InputIteratorA last_a,
231  InputIteratorB first_b, InputIteratorB last_b,
233 ) {
234  // Use cleaned results with only finite values. We need those internally anyway to get proper
235  // ranking, and by doing it here already, we can save another copy of the data internally.
236  auto const cleaned = finite_pairs( first_a, last_a, first_b, last_b );
237  return kendalls_tau_correlation_coefficient( cleaned.first, cleaned.second, method );
238 }
239 
248  std::vector<double> const& x,
249  std::vector<double> const& y,
251 );
252 
253 // =================================================================================================
254 // Fisher z-transformation
255 // =================================================================================================
256 
265 inline double fisher_transformation( double correlation_coefficient )
266 {
267  auto const r = correlation_coefficient;
268  if( r < -1.0 || r > 1.0 ) {
269  throw std::invalid_argument(
270  "Cannot apply fisher transformation to value " + std::to_string( r ) +
271  " outside of [ -1.0, 1.0 ]."
272  );
273  }
274 
275  // LOG_DBG << "formula " << 0.5 * log( ( 1.0 + r ) / ( 1.0 - r ) );
276  // LOG_DBG << "simple " << std::atanh( r );
277  return std::atanh( r );
278 }
279 
285 inline std::vector<double> fisher_transformation( std::vector<double> const& correlation_coefficients )
286 {
287  auto res = correlation_coefficients;
288  for( auto& elem : res ) {
289  elem = fisher_transformation( elem );
290  }
291  return res;
292 }
293 
294 } // namespace utils
295 } // namespace genesis
296 
297 #endif // include guard
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::utils::pearson_correlation_coefficient
double pearson_correlation_coefficient(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b)
Calculate the Pearson Correlation Coefficient between two ranges of double.
Definition: correlation.hpp:71
ranking.hpp
common.hpp
genesis::utils::ranking_fractional
std::vector< double > ranking_fractional(RandomAccessIterator first, RandomAccessIterator last)
Return the ranking of the values in the given range, using Fractional ranking ("1 2....
Definition: ranking.hpp:262
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::KendallsTauMethod::kTauC
@ kTauC
Compute Tau-c, (also called Stuart-Kendall Tau-c).
genesis::utils::finite_pairs
std::pair< std::vector< double >, std::vector< double > > finite_pairs(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b)
Helper function that cleans two ranges of double of the same length from non-finite values.
Definition: common.hpp:252
genesis::utils::spearmans_rank_correlation_coefficient
double spearmans_rank_correlation_coefficient(RandomAccessIteratorA first_a, RandomAccessIteratorA last_a, RandomAccessIteratorB first_b, RandomAccessIteratorB last_b)
Calculate Spearman's Rank Correlation Coefficient between two ranges of double.
Definition: correlation.hpp:145
genesis::utils::kendalls_tau_correlation_coefficient_naive
double kendalls_tau_correlation_coefficient_naive(std::vector< double > const &x, std::vector< double > const &y, KendallsTauMethod method)
Compute a simple version of Kendall's Tau Correlation Coefficient.
Definition: correlation.cpp:468
genesis::utils::KendallsTauMethod::kTauA
@ kTauA
Compute Tau-a, which does not make any adjustment for ties.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::KendallsTauMethod
KendallsTauMethod
Method for computing Kendall's Tau.
Definition: correlation.hpp:187
genesis::utils::fisher_transformation
double fisher_transformation(double correlation_coefficient)
Apply Fisher z-transformation to a correlation coefficient.
Definition: correlation.hpp:265
genesis::utils::kendalls_tau_correlation_coefficient
double kendalls_tau_correlation_coefficient(std::vector< double > const &x, std::vector< double > const &y, KendallsTauMethod method)
Compute Kendall's Tau Correlation Coefficient.
Definition: correlation.cpp:423
genesis::utils::KendallsTauMethod::kTauB
@ kTauB
Compute Tau-b, which does adjustments for ties.
genesis::utils::for_each_finite_pair
void for_each_finite_pair(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b, std::function< void(double value_a, double value_b)> execute)
Iterate two ranges of double values in parallel, and execute a function for each pair of values from ...
Definition: common.hpp:286