A library for working with phylogenetic and population genetic data.
v0.27.0
statistics.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_MATH_STATISTICS_H_
2 #define GENESIS_UTILS_MATH_STATISTICS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2021 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cmath>
41 #include <cstddef>
42 #include <functional>
43 #include <limits>
44 #include <stdexcept>
45 #include <utility>
46 #include <vector>
47 
48 namespace genesis {
49 namespace utils {
50 
51 // =================================================================================================
52 // Static Assersions
53 // =================================================================================================
54 
55 // We need to make sure that doubles and their infinities behave the way we expect!
56 
57 static_assert(
58  std::numeric_limits<double>::is_iec559,
59  "IEC 559/IEEE 754 floating-point types required (wrong double type)."
60 );
61 static_assert(
62  std::numeric_limits<double>::has_infinity,
63  "IEC 559/IEEE 754 floating-point types required (does not have infinity)."
64 );
65 static_assert(
66  std::numeric_limits<double>::has_quiet_NaN,
67  "IEC 559/IEEE 754 floating-point types required (does not have quite NaN)."
68 );
69 static_assert(
70  - std::numeric_limits<double>::infinity() < std::numeric_limits<double>::lowest(),
71  "IEC 559/IEEE 754 floating-point types required (infinity is not the lowest value)."
72 );
73 
74 // Clang fails to compile the following assertions, because of missing const expr markers
75 // in their std lib implementation. We hence need to skip those tests for clang :-(
76 // Hopefully, the above assertions are enough to cover the basics.
77 
78 #ifndef __clang__
79 
80 static_assert(
81  std::isinf( - std::numeric_limits<double>::infinity() ),
82  "IEC 559/IEEE 754 floating-point types required (infinity is not working properly)."
83 );
84 static_assert(
85  std::isinf( -1 * std::numeric_limits<double>::infinity()),
86  "IEC 559/IEEE 754 floating-point types required."
87 );
88 static_assert(
89  -1 * std::numeric_limits<double>::infinity() < std::numeric_limits<double>::lowest(),
90  "IEC 559/IEEE 754 floating-point types required."
91 );
92 
93 #endif // __clang__
94 
95 // =================================================================================================
96 // Structures and Classes
97 // =================================================================================================
98 
104 template< typename T >
106 {
107  T min;
108  T max;
109 };
110 
118 {
119  double mean;
120  double stddev;
121 };
122 
127 struct Quartiles
128 {
129  double q0 = 0.0;
130  double q1 = 0.0;
131  double q2 = 0.0;
132  double q3 = 0.0;
133  double q4 = 0.0;
134 };
135 
136 // =================================================================================================
137 // Standard Helper Functions
138 // =================================================================================================
139 
149 template <class ForwardIterator>
150 std::pair<size_t, size_t> count_finite_elements( ForwardIterator first, ForwardIterator last )
151 {
152  // Prepare result.
153  size_t valid = 0;
154  size_t total = 0;
155 
156  // Iterate.
157  while( first != last ) {
158  if( std::isfinite( *first ) ) {
159  ++valid;
160  }
161  ++total;
162  ++first;
163  }
164 
165  return { valid, total };
166 }
167 
174 template <class ForwardIterator>
175 double finite_minimum( ForwardIterator first, ForwardIterator last )
176 {
177  // Prepare result.
178  double min = std::numeric_limits<double>::max();
179  size_t cnt = 0;
180 
181  // Iterate.
182  while( first != last ) {
183  if( std::isfinite( *first ) ) {
184  if( *first < min ) {
185  min = *first;
186  }
187  ++cnt;
188  }
189  ++first;
190  }
191 
192  // If there are no valid elements, return nan.
193  if( cnt == 0 ) {
194  return std::numeric_limits<double>::quiet_NaN();
195  }
196 
197  return min;
198 }
199 
206 template <class ForwardIterator>
207 double finite_maximum( ForwardIterator first, ForwardIterator last )
208 {
209  // Prepare result.
210  double max = std::numeric_limits<double>::lowest();
211  size_t cnt = 0;
212 
213  // Iterate.
214  while( first != last ) {
215  if( std::isfinite( *first ) ) {
216  if( *first > max ) {
217  max = *first;
218  }
219  ++cnt;
220  }
221  ++first;
222  }
223 
224  // If there are no valid elements, return nan.
225  if( cnt == 0 ) {
226  return std::numeric_limits<double>::quiet_NaN();
227  }
228 
229  return max;
230 }
231 
238 template <class ForwardIterator>
239 MinMaxPair<double> finite_minimum_maximum( ForwardIterator first, ForwardIterator last )
240 {
241  // Prepare result.
242  MinMaxPair<double> result;
243  result.min = std::numeric_limits<double>::max();
244  result.max = std::numeric_limits<double>::lowest();
245  size_t cnt = 0;
246 
247  // Iterate.
248  while( first != last ) {
249  if( std::isfinite( *first ) ) {
250  if( *first < result.min ) {
251  result.min = *first;
252  }
253  if( *first > result.max ) {
254  result.max = *first;
255  }
256  ++cnt;
257  }
258  ++first;
259  }
260 
261  // If there are no valid elements, return nan.
262  if( cnt == 0 ) {
263  result.min = std::numeric_limits<double>::quiet_NaN();
264  result.max = std::numeric_limits<double>::quiet_NaN();
265  }
266 
267  return result;
268 }
269 
270 // =================================================================================================
271 // Normalization and Compositional Data Analysis
272 // =================================================================================================
273 
287 template <class ForwardIterator>
288 void closure( ForwardIterator first, ForwardIterator last )
289 {
290  // Prepare result.
291  double sum = 0.0;
292  size_t cnt = 0;
293 
294  // Sum up elements.
295  auto it = first;
296  while( it != last ) {
297  if( std::isfinite( *it ) ) {
298  if( *it < 0.0 ) {
299  throw std::invalid_argument(
300  "Cannot calculate closure of negative numbers."
301  );
302  }
303 
304  sum += *it;
305  ++cnt;
306  }
307  ++it;
308  }
309 
310  // If there are no valid elements, return.
311  if( cnt == 0 ) {
312  return;
313  }
314 
315  // Make the closure.
316  it = first;
317  while( it != last ) {
318  if( std::isfinite( *it ) ) {
319  *it /= sum;
320  }
321  ++it;
322  }
323 }
324 
330 inline void closure( std::vector<double>& vec )
331 {
332  return closure( vec.begin(), vec.end() );
333 }
334 
335 // =================================================================================================
336 // Mean Stddev
337 // =================================================================================================
338 
357 template <class ForwardIterator>
358 MeanStddevPair mean_stddev( ForwardIterator first, ForwardIterator last, double epsilon = -1.0 )
359 {
360  // Prepare result.
361  MeanStddevPair result;
362  result.mean = 0.0;
363  result.stddev = 0.0;
364  size_t count = 0;
365 
366  // Sum up elements.
367  auto it = first;
368  while( it != last ) {
369  if( std::isfinite( *it ) ) {
370  result.mean += *it;
371  ++count;
372  }
373  ++it;
374  }
375 
376  // If there are no valid elements, return an all-zero result.
377  if( count == 0 ) {
378  return result;
379  }
380 
381  // Calculate mean.
382  result.mean /= static_cast<double>( count );
383 
384  // Calculate std dev.
385  it = first;
386  while( it != last ) {
387  if( std::isfinite( *it ) ) {
388  result.stddev += (( *it - result.mean ) * ( *it - result.mean ));
389  }
390  ++it;
391  }
392  assert( count > 0 );
393  result.stddev /= static_cast<double>( count );
394  result.stddev = std::sqrt( result.stddev );
395 
396  // The following in an inelegant (but usual) way to handle near-zero values,
397  // which later would cause a division by zero.
398  assert( result.stddev >= 0.0 );
399  if( result.stddev <= epsilon ){
400  result.stddev = 1.0;
401  }
402 
403  return result;
404 }
405 
413 inline MeanStddevPair mean_stddev( std::vector<double> const& vec, double epsilon = -1.0 )
414 {
415  return mean_stddev( vec.begin(), vec.end(), epsilon );
416 }
417 
418 // =================================================================================================
419 // Arithmetic Mean
420 // =================================================================================================
421 
437 template <class ForwardIterator>
438 double arithmetic_mean( ForwardIterator first, ForwardIterator last )
439 {
440  // Prepare result.
441  double mean = 0.0;
442  size_t count = 0;
443 
444  // Sum up elements.
445  auto it = first;
446  while( it != last ) {
447  if( std::isfinite( *it ) ) {
448  mean += *it;
449  ++count;
450  }
451  ++it;
452  }
453 
454  // If there are no valid elements, return an all-zero result.
455  if( count == 0 ) {
456  assert( mean == 0.0 );
457  return mean;
458  }
459 
460  // Calculate mean.
461  assert( count > 0 );
462  return mean / static_cast<double>( count );
463 }
464 
473 inline double arithmetic_mean( std::vector<double> const& vec )
474 {
475  return arithmetic_mean( vec.begin(), vec.end() );
476 }
477 
495 template <class ForwardIterator>
497  ForwardIterator first_value, ForwardIterator last_value,
498  ForwardIterator first_weight, ForwardIterator last_weight
499 ) {
500  double num = 0.0;
501  double den = 0.0;
502  size_t cnt = 0;
503 
504  // Multiply elements.
506  first_value, last_value,
507  first_weight, last_weight,
508  [&]( double value, double weight ){
509  if( weight < 0.0 ) {
510  throw std::invalid_argument(
511  "Cannot calculate weighted arithmetic mean with negative weights."
512  );
513  }
514 
515  num += weight * value;
516  den += weight;
517  ++cnt;
518  }
519  );
520 
521  // If there are no valid elements, return an all-zero result.
522  if( cnt == 0 ) {
523  return 0.0;
524  }
525  if( den == 0.0 ) {
526  throw std::invalid_argument(
527  "Cannot calculate weighted arithmetic mean with all weights being 0."
528  );
529  }
530 
531  // Return the result.
532  assert( cnt > 0 );
533  assert( den > 0.0 );
534  return ( num / den );
535 }
536 
548  std::vector<double> const& values,
549  std::vector<double> const& weights
550 ) {
551  return weighted_arithmetic_mean( values.begin(), values.end(), weights.begin(), weights.end() );
552 }
553 
554 // =================================================================================================
555 // Geometric Mean
556 // =================================================================================================
557 
573 template <class ForwardIterator>
574 double geometric_mean( ForwardIterator first, ForwardIterator last )
575 {
576  double sum = 0.0;
577  size_t count = 0;
578 
579  // Iterate elements. For numeric stability, we use sum of logs instead of products;
580  // otherwise, we run into overflows too quickly!
581  auto it = first;
582  while( it != last ) {
583  if( std::isfinite( *it ) ) {
584  if( *it <= 0.0 ) {
585  throw std::invalid_argument(
586  "Cannot calculate geometric mean of non-positive numbers."
587  );
588  }
589  sum += std::log( *it );
590  ++count;
591  }
592  ++it;
593  }
594 
595  // If there are no valid elements, return an all-zero result.
596  if( count == 0 ) {
597  return 0.0;
598  }
599 
600  // Return the result.
601  assert( count > 0 );
602  assert( std::isfinite( sum ));
603  return std::exp( sum / static_cast<double>( count ));
604 }
605 
613 inline double geometric_mean( std::vector<double> const& vec )
614 {
615  return geometric_mean( vec.begin(), vec.end() );
616 }
617 
648 template <class ForwardIterator>
650  ForwardIterator first_value, ForwardIterator last_value,
651  ForwardIterator first_weight, ForwardIterator last_weight
652 ) {
653  double num = 0.0;
654  double den = 0.0;
655  size_t cnt = 0;
656 
657  // Multiply elements.
659  first_value, last_value,
660  first_weight, last_weight,
661  [&]( double value, double weight ){
662  if( value <= 0.0 ) {
663  throw std::invalid_argument(
664  "Cannot calculate weighted geometric mean of non-positive values."
665  );
666  }
667  if( weight < 0.0 ) {
668  throw std::invalid_argument(
669  "Cannot calculate weighted geometric mean with negative weights."
670  );
671  }
672 
673  num += weight * std::log( value );
674  den += weight;
675  ++cnt;
676  }
677  );
678 
679  // If there are no valid elements, return an all-zero result.
680  if( cnt == 0 ) {
681  return 0.0;
682  }
683  if( den == 0.0 ) {
684  throw std::invalid_argument(
685  "Cannot calculate weighted geometric mean with all weights being 0."
686  );
687  }
688 
689  // Return the result.
690  assert( cnt > 0 );
691  assert( std::isfinite( num ));
692  assert( std::isfinite( den ) && ( den > 0.0 ));
693  return std::exp( num / den );
694 }
695 
707  std::vector<double> const& values,
708  std::vector<double> const& weights
709 ) {
710  return weighted_geometric_mean( values.begin(), values.end(), weights.begin(), weights.end() );
711 }
712 
713 // =================================================================================================
714 // Harmoic Mean
715 // =================================================================================================
716 
722 {
726  kThrow,
727 
731  kIgnore,
732 
740  kReturnZero,
741 
756 };
757 
774 template <class ForwardIterator>
776  ForwardIterator first, ForwardIterator last,
778 ) {
779  // Keep track of the total sum of inverses, the count of how many samples were used in total
780  // (this excludes non-finite data points), and the number of zero value found, which is only
781  // used with HarmonicMeanZeroPolicy::kCorrection
782  double sum = 0.0;
783  size_t count = 0;
784  size_t zeroes = 0;
785 
786  // Iterate elements. For numeric stability, we use sum of logs instead of products;
787  // otherwise, we run into overflows too quickly!
788  auto it = first;
789  while( it != last ) {
790  if( std::isfinite( *it ) ) {
791  if( *it < 0.0 ) {
792  throw std::invalid_argument(
793  "Cannot calculate harmonic mean of negative values."
794  );
795  }
796 
797  if( *it > 0.0 ) {
798  sum += 1.0 / static_cast<double>( *it );
799  ++count;
800  } else {
801  assert( *it == 0.0 );
802  switch( zero_policy ) {
804  throw std::invalid_argument(
805  "Zero value found when calculating harmonic mean."
806  );
807  }
809  // Do nothing.
810  break;
811  }
813  // If any value is zero, we do not need to finish the iteration.
814  return 0.0;
815  }
817  // Increment both counters, but do not add anything to the sum.
818  ++count;
819  ++zeroes;
820  break;
821  }
822  }
823  }
824  }
825  ++it;
826  }
827 
828  // If there are no valid elements, or all of them are zero, return an all-zero result.
829  if( count == 0 || count == zeroes ) {
830  return 0.0;
831  }
832  assert( count > 0 );
833  assert( count > zeroes );
834  assert( std::isfinite( sum ));
835 
836  // Return the result. We always compute the correction,
837  // which however does not alter the result if not used.
838  auto const correction = static_cast<double>( count - zeroes ) / static_cast<double>( count );
839  return correction * static_cast<double>( count - zeroes ) / sum;
840 }
841 
849 inline double harmonic_mean(
850  std::vector<double> const& vec,
852 ) {
853  return harmonic_mean( vec.begin(), vec.end(), zero_policy );
854 }
855 
883 template <class ForwardIterator>
885  ForwardIterator first_value, ForwardIterator last_value,
886  ForwardIterator first_weight, ForwardIterator last_weight,
888 ) {
889  // Keep track of the numerator (sum of all weights of positive values) and denominator
890  // (sum of weights divided by values) of the summation, as well as the sum of all weights
891  // (which can be different from the numerator, if there are zero values), the total number of
892  // values used, and the number of zero values found.
893  double weights = 0.0;
894  double num = 0.0;
895  double den = 0.0;
896  size_t count = 0;
897  size_t zeroes = 0;
898 
899  // Multiply elements, only considering finite ones.
901  first_value, last_value,
902  first_weight, last_weight,
903  [&]( double value, double weight ){
904  if( value < 0.0 ) {
905  throw std::invalid_argument(
906  "Cannot calculate weighted harmonic mean of negative values."
907  );
908  }
909  if( weight < 0.0 ) {
910  throw std::invalid_argument(
911  "Cannot calculate weighted harmonic mean with negative weights."
912  );
913  }
914  if( value > 0.0 ) {
915  weights += weight;
916  num += weight;
917  den += weight / static_cast<double>( value );
918  ++count;
919  } else {
920  assert( value == 0.0 );
921  switch( zero_policy ) {
923  throw std::invalid_argument(
924  "Zero value found when calculating weighted harmonic mean."
925  );
926  }
928  // Do nothing.
929  break;
930  }
933  // Increment the sum of all weights, so that zero values are contributing
934  // to the corrected result according to their weight, and increment
935  // both counters, but do not add anything to the sums.
936  // In case of the return zero policy, we use the zeroes counter as a flag
937  // indicating that we have found a zero value.
938  weights += weight;
939  ++count;
940  ++zeroes;
941  break;
942  }
943  }
944  }
945  }
946  );
947 
948  // If there are no valid elements, or all of them are zero, return an all-zero result.
949  if( count == 0 || count == zeroes ) {
950  return 0.0;
951  }
952  // For the return zero policy, if one of them is zero, return zero.
953  if( zero_policy == HarmonicMeanZeroPolicy::kReturnZero && zeroes > 0 ) {
954  return 0.0;
955  }
956  if( num == 0.0 || den == 0.0 ) {
957  throw std::invalid_argument(
958  "Cannot calculate weighted harmonic mean with all weights being 0."
959  );
960  }
961  if( zeroes == 0 ) {
962  (void) zeroes;
963  assert( weights == num );
964  }
965  assert( count > 0 );
966  assert( count > zeroes );
967  assert( weights >= num );
968  assert( std::isfinite( num ) && ( num > 0.0 ));
969  assert( std::isfinite( den ) && ( den > 0.0 ));
970  assert( std::isfinite( weights ) && ( weights > 0.0 ));
971 
972  // Return the result. We always compute the correction,
973  // which however does not alter the result if not used.
974  auto const correction = num / weights;
975  return correction * num / den;
976 }
977 
989  std::vector<double> const& values,
990  std::vector<double> const& weights,
992 ) {
993  return weighted_harmonic_mean(
994  values.begin(), values.end(),
995  weights.begin(), weights.end(),
996  zero_policy
997  );
998 }
999 
1000 // =================================================================================================
1001 // Median
1002 // =================================================================================================
1003 
1013 template <class RandomAccessIterator>
1014 double median( RandomAccessIterator first, RandomAccessIterator last )
1015 {
1016  // Checks.
1017  if( ! std::is_sorted( first, last )) {
1018  throw std::runtime_error( "Range has to be sorted for median calculation." );
1019  }
1020  auto const size = static_cast<size_t>( std::distance( first, last ));
1021  if( size == 0 ) {
1022  return 0.0;
1023  }
1024 
1025  // Even or odd size? Median is calculated differently.
1026  if( size % 2 == 0 ) {
1027 
1028  // Get the two middle positions.
1029  size_t pl = size / 2 - 1;
1030  size_t pu = size / 2;
1031  assert( pl < size && pu < size );
1032 
1033  return ( *(first + pl) + *(first + pu) ) / 2.0;
1034 
1035  } else {
1036 
1037  // Int division, rounds down. This is what we want.
1038  size_t p = size / 2;
1039  assert( p < size );
1040 
1041  return *(first + p);
1042  }
1043 }
1044 
1050 inline double median( std::vector<double> const& vec )
1051 {
1052  return median( vec.begin(), vec.end() );
1053 }
1054 
1055 // =================================================================================================
1056 // Quartiles
1057 // =================================================================================================
1058 
1065 template <class RandomAccessIterator>
1066 Quartiles quartiles( RandomAccessIterator first, RandomAccessIterator last )
1067 {
1068  // Prepare result.
1069  Quartiles result;
1070 
1071  // Checks.
1072  if( ! std::is_sorted( first, last )) {
1073  throw std::runtime_error( "Range has to be sorted for quartiles calculation." );
1074  }
1075  auto const size = static_cast<size_t>( std::distance( first, last ));
1076  if( size == 0 ) {
1077  return result;
1078  }
1079 
1080  // Set min, 50% and max.
1081  result.q0 = *first;
1082  result.q2 = median( first, last );
1083  result.q4 = *(first + size - 1);
1084 
1085  // Even or odd size? Quartiles are calculated differently.
1086  // This could be done shorter, but this way is more expressive.
1087  if( size % 2 == 0 ) {
1088 
1089  // Even: Split exaclty in halves.
1090  result.q1 = median( first, first + size / 2 );
1091  result.q3 = median( first + size / 2, first + size );
1092 
1093  } else {
1094 
1095  // Odd: Do not include the median value itself.
1096  result.q1 = median( first, first + size / 2 );
1097  result.q3 = median( first + size / 2 + 1, first + size );
1098  }
1099 
1100  return result;
1101 }
1102 
1108 inline Quartiles quartiles( std::vector<double> const& vec )
1109 {
1110  return quartiles( vec.begin(), vec.end() );
1111 }
1112 
1113 // =================================================================================================
1114 // Dispersion
1115 // =================================================================================================
1116 
1125 inline double coefficient_of_variation( MeanStddevPair const& ms )
1126 {
1127  return ms.stddev / ms.mean;
1128 }
1129 
1133 inline std::vector<double> coefficient_of_variation( std::vector<MeanStddevPair> const& ms )
1134 {
1135  auto res = std::vector<double>( ms.size() );
1136  for( size_t i = 0; i < ms.size(); ++i ) {
1137  res[ i ] = coefficient_of_variation( ms[i] );
1138  }
1139  return res;
1140 }
1141 
1151 inline double index_of_dispersion( MeanStddevPair const& ms )
1152 {
1153  return ms.stddev * ms.stddev / ms.mean;
1154 }
1155 
1159 inline std::vector<double> index_of_dispersion( std::vector<MeanStddevPair> const& ms )
1160 {
1161  auto res = std::vector<double>( ms.size() );
1162  for( size_t i = 0; i < ms.size(); ++i ) {
1163  res[ i ] = index_of_dispersion( ms[i] );
1164  }
1165  return res;
1166 }
1167 
1176 {
1177  return ( q.q3 - q.q1 ) / ( q.q3 + q.q1 );
1178 }
1179 
1183 inline std::vector<double> quartile_coefficient_of_dispersion( std::vector<Quartiles> const& q )
1184 {
1185  auto res = std::vector<double>( q.size() );
1186  for( size_t i = 0; i < q.size(); ++i ) {
1187  res[ i ] = quartile_coefficient_of_dispersion( q[i] );
1188  }
1189  return res;
1190 }
1191 
1192 // =================================================================================================
1193 // Correlation Coefficients
1194 // =================================================================================================
1195 
1208 template <class ForwardIteratorA, class ForwardIteratorB>
1210  ForwardIteratorA first_a, ForwardIteratorA last_a,
1211  ForwardIteratorB first_b, ForwardIteratorB last_b
1212 ) {
1213  // Calculate means.
1214  double mean_a = 0.0;
1215  double mean_b = 0.0;
1216  size_t count = 0;
1218  first_a, last_a,
1219  first_b, last_b,
1220  [&]( double val_a, double val_b ){
1221  mean_a += val_a;
1222  mean_b += val_b;
1223  ++count;
1224  }
1225  );
1226  if( count == 0 ) {
1227  return std::numeric_limits<double>::quiet_NaN();
1228  }
1229  assert( count > 0 );
1230  mean_a /= static_cast<double>( count );
1231  mean_b /= static_cast<double>( count );
1232 
1233  // Calculate PCC parts.
1234  double numerator = 0.0;
1235  double std_dev_a = 0.0;
1236  double std_dev_b = 0.0;
1238  first_a, last_a,
1239  first_b, last_b,
1240  [&]( double val_a, double val_b ){
1241  double const d1 = val_a - mean_a;
1242  double const d2 = val_b - mean_b;
1243  numerator += d1 * d2;
1244  std_dev_a += d1 * d1;
1245  std_dev_b += d2 * d2;
1246  }
1247  );
1248 
1249  // Calculate PCC, and assert that it is in the correct range
1250  // (or not a number, which can happen if the std dev is 0.0, e.g. in all-zero vectors).
1251  auto const pcc = numerator / ( std::sqrt( std_dev_a ) * std::sqrt( std_dev_b ) );
1252  assert(( -1.0 <= pcc && pcc <= 1.0 ) || ( ! std::isfinite( pcc ) ));
1253  return pcc;
1254 }
1255 
1262  std::vector<double> const& vec_a,
1263  std::vector<double> const& vec_b
1264 ) {
1266  vec_a.begin(), vec_a.end(), vec_b.begin(), vec_b.end()
1267  );
1268 }
1269 
1278 template <class RandomAccessIteratorA, class RandomAccessIteratorB>
1280  RandomAccessIteratorA first_a, RandomAccessIteratorA last_a,
1281  RandomAccessIteratorB first_b, RandomAccessIteratorB last_b
1282 ) {
1283  // Get cleaned results. We need to make these copies, as we need to calculate the fractional
1284  // ranking on them, which would change if we used our normal for_each_finite_pair here...
1285  auto const cleaned = finite_pairs( first_a, last_a, first_b, last_b );
1286 
1287  // Get the ranking of both vectors.
1288  auto ranks_a = ranking_fractional( cleaned.first );
1289  auto ranks_b = ranking_fractional( cleaned.second );
1290  assert( ranks_a.size() == ranks_b.size() );
1291 
1292  return pearson_correlation_coefficient( ranks_a, ranks_b );
1293 }
1294 
1301  std::vector<double> const& vec_a,
1302  std::vector<double> const& vec_b
1303 ) {
1305  vec_a.begin(), vec_a.end(), vec_b.begin(), vec_b.end()
1306  );
1307 }
1308 
1317 inline double fisher_transformation( double correlation_coefficient )
1318 {
1319  auto const r = correlation_coefficient;
1320  if( r < -1.0 || r > 1.0 ) {
1321  throw std::invalid_argument(
1322  "Cannot apply fisher transformation to value " + std::to_string( r ) +
1323  " outside of [ -1.0, 1.0 ]."
1324  );
1325  }
1326 
1327  // LOG_DBG << "formula " << 0.5 * log( ( 1.0 + r ) / ( 1.0 - r ) );
1328  // LOG_DBG << "simple " << std::atanh( r );
1329  return std::atanh( r );
1330 }
1331 
1337 inline std::vector<double> fisher_transformation( std::vector<double> const& correlation_coefficients )
1338 {
1339  auto res = correlation_coefficients;
1340  for( auto& elem : res ) {
1341  elem = fisher_transformation( elem );
1342  }
1343  return res;
1344 }
1345 
1346 } // namespace utils
1347 } // namespace genesis
1348 
1349 #endif // include guard
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::utils::arithmetic_mean
double arithmetic_mean(ForwardIterator first, ForwardIterator last)
Calculate the arithmetic mean of a range of numbers.
Definition: statistics.hpp:438
genesis::utils::Quartiles::q4
double q4
Definition: statistics.hpp:133
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::utils::pearson_correlation_coefficient
double pearson_correlation_coefficient(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b)
Calculate the Pearson Correlation Coefficient between two ranges of double.
Definition: statistics.hpp:1209
ranking.hpp
genesis::utils::MeanStddevPair::mean
double mean
Definition: statistics.hpp:119
genesis::utils::Quartiles::q1
double q1
Definition: statistics.hpp:130
common.hpp
genesis::utils::geometric_mean
double geometric_mean(ForwardIterator first, ForwardIterator last)
Calculate the geometric mean of a range of positive numbers.
Definition: statistics.hpp:574
genesis::utils::ranking_fractional
std::vector< double > ranking_fractional(RandomAccessIterator first, RandomAccessIterator last)
Return the ranking of the values in the given range, using Fractional ranking ("1 2....
Definition: ranking.hpp:260
genesis::utils::HarmonicMeanZeroPolicy::kThrow
@ kThrow
Throw an exception when a zero value is encountered in the data.
genesis::utils::HarmonicMeanZeroPolicy::kReturnZero
@ kReturnZero
If any zero value is encountered in the data, simply return zero as the harmonic mean.
genesis::utils::finite_maximum
double finite_maximum(ForwardIterator first, ForwardIterator last)
Return the maximum of a range of double values.
Definition: statistics.hpp:207
genesis::utils::HarmonicMeanZeroPolicy
HarmonicMeanZeroPolicy
Select a policy on how to treat zeroes in the computation of harmonic_mean() and weighted_harmonic_me...
Definition: statistics.hpp:721
genesis::utils::HarmonicMeanZeroPolicy::kIgnore
@ kIgnore
Ignore any zero values.
genesis::utils::count_finite_elements
std::pair< size_t, size_t > count_finite_elements(ForwardIterator first, ForwardIterator last)
Count the number of finite elements in a range of double values.
Definition: statistics.hpp:150
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genesis::utils::Quartiles::q0
double q0
Definition: statistics.hpp:129
genesis::utils::weighted_geometric_mean
double weighted_geometric_mean(ForwardIterator first_value, ForwardIterator last_value, ForwardIterator first_weight, ForwardIterator last_weight)
Calculate the weighted geometric mean of a range of positive numbers.
Definition: statistics.hpp:649
genesis::utils::HarmonicMeanZeroPolicy::kCorrection
@ kCorrection
Apply a zero-value correction.
genesis::utils::finite_pairs
std::pair< std::vector< double >, std::vector< double > > finite_pairs(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b)
Helper function that cleans two ranges of double of the same length from non-finite values.
Definition: common.hpp:276
genesis::utils::spearmans_rank_correlation_coefficient
double spearmans_rank_correlation_coefficient(RandomAccessIteratorA first_a, RandomAccessIteratorA last_a, RandomAccessIteratorB first_b, RandomAccessIteratorB last_b)
Calculate Spearman's Rank Correlation Coefficient between two ranges of double.
Definition: statistics.hpp:1279
genesis::utils::weighted_harmonic_mean
double weighted_harmonic_mean(ForwardIterator first_value, ForwardIterator last_value, ForwardIterator first_weight, ForwardIterator last_weight, HarmonicMeanZeroPolicy zero_policy=HarmonicMeanZeroPolicy::kThrow)
Calculate the weighted harmonic mean of a range of positive numbers.
Definition: statistics.hpp:884
genesis::utils::quartiles
Quartiles quartiles(RandomAccessIterator first, RandomAccessIterator last)
Calculate the Quartiles of a sorted range of double values.
Definition: statistics.hpp:1066
genesis::utils::mean_stddev
MeanStddevPair mean_stddev(ForwardIterator first, ForwardIterator last, double epsilon=-1.0)
Calculate the arithmetic mean and standard deviation of a range of double elements.
Definition: statistics.hpp:358
genesis::utils::MeanStddevPair::stddev
double stddev
Definition: statistics.hpp:120
genesis::utils::harmonic_mean
double harmonic_mean(ForwardIterator first, ForwardIterator last, HarmonicMeanZeroPolicy zero_policy=HarmonicMeanZeroPolicy::kThrow)
Calculate the harmonic mean of a range of positive numbers.
Definition: statistics.hpp:775
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::fisher_transformation
double fisher_transformation(double correlation_coefficient)
Apply Fisher z-transformation to a correlation coefficient.
Definition: statistics.hpp:1317
genesis::utils::Quartiles::q3
double q3
Definition: statistics.hpp:132
genesis::utils::MinMaxPair::max
T max
Definition: statistics.hpp:108
genesis::utils::mean
double mean(const Histogram &h)
Compute the bin-weighted arithmetic mean.
Definition: utils/math/histogram/stats.cpp:91
genesis::utils::index_of_dispersion
double index_of_dispersion(MeanStddevPair const &ms)
Calculate the index of dispersion.
Definition: statistics.hpp:1151
genesis::utils::closure
void closure(ForwardIterator first, ForwardIterator last)
Calculate the closure of a range of numbers.
Definition: statistics.hpp:288
genesis::utils::coefficient_of_variation
double coefficient_of_variation(MeanStddevPair const &ms)
Calculate the index of dispersion.
Definition: statistics.hpp:1125
genesis::utils::MinMaxPair
Store a pair of min and max values.
Definition: statistics.hpp:105
genesis::utils::quartile_coefficient_of_dispersion
double quartile_coefficient_of_dispersion(Quartiles const &q)
Calculate the quartile_coefficient_of_dispersion.
Definition: statistics.hpp:1175
genesis::utils::Quartiles::q2
double q2
Definition: statistics.hpp:131
genesis::utils::finite_minimum_maximum
MinMaxPair< double > finite_minimum_maximum(ForwardIterator first, ForwardIterator last)
Return the minimum and the maximum of a range of double values.
Definition: statistics.hpp:239
genesis::utils::weighted_arithmetic_mean
double weighted_arithmetic_mean(ForwardIterator first_value, ForwardIterator last_value, ForwardIterator first_weight, ForwardIterator last_weight)
Calculate the weighted arithmetic mean of a range of double values.
Definition: statistics.hpp:496
genesis::utils::MeanStddevPair
Store a mean and a standard deviation value.
Definition: statistics.hpp:117
genesis::utils::Quartiles
Store the values of quartiles: q0 == min, q1 == 25%, q2 == 50%, q3 == 75%, q4 == max.
Definition: statistics.hpp:127
genesis::utils::finite_minimum
double finite_minimum(ForwardIterator first, ForwardIterator last)
Return the minimum of a range of double values.
Definition: statistics.hpp:175
genesis::utils::for_each_finite_pair
void for_each_finite_pair(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b, std::function< void(double value_a, double value_b)> execute)
Iterate two ranges of double values in parallel, and execute a function for each pair of values from ...
Definition: common.hpp:310
genesis::utils::MinMaxPair::min
T min
Definition: statistics.hpp:107
genesis::utils::median
double median(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:68