A library for working with phylogenetic and population genetic data.
v0.32.0
fst_pool_karlsson.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_FST_POOL_KARLSSON_H_
2 #define GENESIS_POPULATION_FUNCTION_FST_POOL_KARLSSON_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
39 
40 #include <cassert>
41 #include <cmath>
42 #include <limits>
43 #include <stdexcept>
44 #include <tuple>
45 #include <utility>
46 #include <vector>
47 
48 namespace genesis {
49 namespace population {
50 
51 // =================================================================================================
52 // Fst Pool Calculator Karlsson
53 // =================================================================================================
54 
74 {
75 public:
76 
77  // -------------------------------------------------------------------------
78  // Constructors and Rule of Five
79  // -------------------------------------------------------------------------
80 
81  FstPoolCalculatorKarlsson() = default;
82 
87  FstPoolCalculatorKarlsson( size_t p1_poolsize, size_t p2_poolsize )
88  {
89  (void) p1_poolsize;
90  (void) p2_poolsize;
91  }
92 
93  virtual ~FstPoolCalculatorKarlsson() override = default;
94 
97 
100 
101  // -------------------------------------------------------------------------
102  // Abstract Members
103  // -------------------------------------------------------------------------
104 
105 private:
106 
107  void reset_() override
108  {
109  sum_nk_ = 0.0;
110  sum_dk_ = 0.0;
111  }
112 
113  void process_( SampleCounts const& p1, SampleCounts const& p2 ) override
114  {
115  // Get intermediate values and add them up.
116  auto const counts = sorted_average_sample_counts( p1, p2 );
117  auto const nkdk = f_st_pool_karlsson_nkdk( counts );
118  sum_nk_ += nkdk.first;
119  sum_dk_ += nkdk.second;
120  }
121 
122  double get_result_() const override
123  {
124  return static_cast<double>( sum_nk_ ) / static_cast<double>( sum_dk_ );
125  }
126 
127  // -------------------------------------------------------------------------
128  // Helper Functions
129  // -------------------------------------------------------------------------
130 
131 public:
132 
140  static std::pair<double, double> f_st_pool_karlsson_nkdk(
141  std::pair<SortedSampleCounts, SortedSampleCounts> const& sample_counts
142  ) {
143  // PoPoolation2 function: calculate_nk_dk
144  using namespace genesis::utils;
145 
146  // Error check. We only want biallelic SNPs, so we check that the smallest two values
147  // here are actually zero.
148  // Update: That does not quite work corrctly if we want to filter by min counts,
149  // in which cases we might have minor third and fourth alleles that are below the min count.
150  // We can safely ignore this here, as those are not taken into account below anyway.
151  // This is also now how PoPoolation2 handles this situation.
152  // if(
153  // sample_counts.first[2].count != 0 || sample_counts.first[3].count != 0 ||
154  // sample_counts.second[2].count != 0 || sample_counts.second[3].count != 0
155  // ) {
156  // throw std::runtime_error(
157  // "In f_st_pool_karlsson(): Expecting biallelic SNPs where only "
158  // "two nucleotide counts are > 0, but found more non-zero counts."
159  // );
160  // }
161  // assert( sample_counts.first[2].count == 0 && sample_counts.first[3].count == 0 );
162  // assert( sample_counts.second[2].count == 0 && sample_counts.second[3].count == 0 );
163 
164  // Assert that the bases are in the same order in both samples.
165  assert(
166  sample_counts.first[0].base == sample_counts.second[0].base &&
167  sample_counts.first[1].base == sample_counts.second[1].base &&
168  sample_counts.first[2].base == sample_counts.second[2].base &&
169  sample_counts.first[3].base == sample_counts.second[3].base
170  );
171 
172  // Get the major allele count (`a` here and in PoPoolation2),
173  // the minor allele count (`b` here, not used in PoPoolation2 under that name),
174  // and the total read depth (`n` here and in PoPoolation2, or nucleotide count).
175  auto const a_1 = static_cast<double>( sample_counts.first[0].count );
176  auto const b_1 = static_cast<double>( sample_counts.first[1].count );
177  auto const n_1 = a_1 + b_1;
178  auto const a_2 = static_cast<double>( sample_counts.second[0].count );
179  auto const b_2 = static_cast<double>( sample_counts.second[1].count );
180  auto const n_2 = a_2 + b_2;
181 
182  // Edge case
183  if( n_1 <= 1.0 || n_2 <= 1.0 ) {
184  return { 0.0, 0.0 };
185  }
186  assert( n_1 > 1.0 );
187  assert( n_2 > 1.0 );
188  assert( a_1 <= n_1 );
189  assert( a_2 <= n_2 );
190 
191  // PoPoolation2 functions: calculate_h1, calculate_h2
192  double const h1 = ( a_1 * b_1 ) / ( n_1 * ( n_1 - 1.0 ));
193  double const h2 = ( a_2 * b_2 ) / ( n_2 * ( n_2 - 1.0 ));
194 
195  // PoPoolation2 function: calculate_nk_dk
196  double const sqr = squared(( a_1 / n_1 ) - ( a_2 / n_2 ));
197  double const sub = ( h1 / n_1 + h2 / n_2 );
198  double const nk = sqr - sub;
199  double const dk = nk + h1 + h2;
200 
201  return { nk, dk };
202  }
203 
204  // -------------------------------------------------------------------------
205  // Private Member Variables
206  // -------------------------------------------------------------------------
207 
208 private:
209 
210  // Result values.
211  utils::NeumaierSum sum_nk_ = 0.0;
212  utils::NeumaierSum sum_dk_ = 0.0;
213 
214 };
215 
216 } // namespace population
217 } // namespace genesis
218 
219 #endif // include guard
functions.hpp
genesis::utils::CompensatedSum
Compensated summation algorithmm, such as Kahan, Neumaier, and Klein summation.
Definition: compensated_sum.hpp:49
common.hpp
std.hpp
Provides some valuable additions to STD.
genesis::utils
Definition: placement/formats/edge_color.hpp:42
fst_pool_calculator.hpp
genesis::population::FstPoolCalculatorKarlsson::FstPoolCalculatorKarlsson
FstPoolCalculatorKarlsson(size_t p1_poolsize, size_t p2_poolsize)
Constructor that takes dummy pool sizes that are not used. Offered to have the same interface as the ...
Definition: fst_pool_karlsson.hpp:87
genesis::population::FstPoolCalculatorKarlsson
Compute the F_ST statistic for pool-sequenced data of Karlsson et al as used in PoPoolation2,...
Definition: fst_pool_karlsson.hpp:73
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::FstPoolCalculatorKarlsson::~FstPoolCalculatorKarlsson
virtual ~FstPoolCalculatorKarlsson() override=default
genesis::population::BaseFstPoolCalculator
Base class to compute FST between two pooled samples, given two instances of SampleCounts.
Definition: fst_pool_calculator.hpp:65
genesis::population::sorted_average_sample_counts
std::pair< SortedSampleCounts, SortedSampleCounts > sorted_average_sample_counts(SampleCounts const &sample_a, SampleCounts const &sample_b)
Return the sorted base counts of both input samples, orderd by the average frequencies of the nucleot...
Definition: population/function/functions.cpp:164
genesis::population::FstPoolCalculatorKarlsson::FstPoolCalculatorKarlsson
FstPoolCalculatorKarlsson()=default
compensated_sum.hpp
genesis::population::FstPoolCalculatorKarlsson::operator=
FstPoolCalculatorKarlsson & operator=(FstPoolCalculatorKarlsson const &)=default
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:223
genesis::population::FstPoolCalculatorKarlsson::f_st_pool_karlsson_nkdk
static std::pair< double, double > f_st_pool_karlsson_nkdk(std::pair< SortedSampleCounts, SortedSampleCounts > const &sample_counts)
Compute the numerator N_k and denominator D_k needed for the asymptotically unbiased F_ST estimator o...
Definition: fst_pool_karlsson.hpp:140