A library for working with phylogenetic and population genetic data.
v0.32.0
fst_pool_kofler.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_FST_POOL_KOFLER_H_
2 #define GENESIS_POPULATION_FUNCTION_FST_POOL_KOFLER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
38 
39 #include <cassert>
40 #include <cmath>
41 #include <limits>
42 #include <stdexcept>
43 #include <tuple>
44 #include <utility>
45 #include <vector>
46 
47 namespace genesis {
48 namespace population {
49 
50 // =================================================================================================
51 // Fst Pool Calculator Kofler
52 // =================================================================================================
53 
71 {
72 public:
73 
74  // -------------------------------------------------------------------------
75  // Constructors and Rule of Five
76  // -------------------------------------------------------------------------
77 
78  FstPoolCalculatorKofler( size_t p1_poolsize, size_t p2_poolsize )
79  : p1_poolsize_( static_cast<double>( p1_poolsize ))
80  , p2_poolsize_( static_cast<double>( p2_poolsize ))
81  {}
82 
83  virtual ~FstPoolCalculatorKofler() override = default;
84 
87 
90 
91  // -------------------------------------------------------------------------
92  // Abstract Members
93  // -------------------------------------------------------------------------
94 
95 private:
96 
97  void reset_() override
98  {
99  // Reset the internal counters, but not the pool sizes, so that the object can be reused.
100  p1_pi_sum_ = 0.0;
101  p2_pi_sum_ = 0.0;
102  pp_pi_sum_ = 0.0;
103  }
104 
105  void process_( SampleCounts const& p1, SampleCounts const& p2 ) override
106  {
107  // Edge and error cases. We will return nan anyway when finializing,
108  // so we can skip all the computation here.
109  if( p1_poolsize_ <= 1.0 || p2_poolsize_ <= 1.0 ) {
110  return;
111  }
112 
113  // Compute frequency based pi snps. The tuple returns p1, p2, pp, in that order.
114  auto const pi_snps = f_st_pool_kofler_pi_snp( p1, p2 );
115 
116  // Skip invalid entries than can happen when less than two of [ACGT] have counts > 0
117  // in one of the SampleCounts samples.
118  if(
119  std::isfinite( std::get<0>( pi_snps )) &&
120  std::isfinite( std::get<1>( pi_snps )) &&
121  std::isfinite( std::get<2>( pi_snps ))
122  ) {
123  // If we are here, both p1 and p2 have counts. Let's assert.
124  assert( p1.a_count + p1.c_count + p1.g_count + p1.t_count > 0 );
125  assert( p2.a_count + p2.c_count + p2.g_count + p2.t_count > 0 );
126 
127  // Now add them to the tally.
128  p1_pi_sum_ += std::get<0>( pi_snps );
129  p2_pi_sum_ += std::get<1>( pi_snps );
130  pp_pi_sum_ += std::get<2>( pi_snps );
131  } else {
132  // If we are here, at least one of the two inputs has one or fewer counts in [ACGT],
133  // otherwise, the results would have been finite. Let's assert this.
134  assert(
135  ( p1.a_count + p1.c_count + p1.g_count + p1.t_count <= 1 ) ||
136  ( p2.a_count + p2.c_count + p2.g_count + p2.t_count <= 1 )
137  );
138  }
139  }
140 
141  double get_result_() const override
142  {
143  // Edge and error cases.
144  if( p1_poolsize_ <= 1.0 || p2_poolsize_ <= 1.0 ) {
145  return std::numeric_limits<double>::quiet_NaN();
146  // throw std::invalid_argument( "Cannot run f_st_pool_kofler() with poolsizes <= 1" );
147  }
148 
149  // Normalize by poolsize
150  assert( p1_poolsize_ > 1.0 && p2_poolsize_ > 1.0 );
151  double const pp_poolsizem = std::min( p1_poolsize_, p2_poolsize_ );
152  auto const p1 = static_cast<double>( p1_pi_sum_ ) * p1_poolsize_ / ( p1_poolsize_ - 1.0 );
153  auto const p2 = static_cast<double>( p2_pi_sum_ ) * p2_poolsize_ / ( p2_poolsize_ - 1.0 );
154  auto const pp = static_cast<double>( pp_pi_sum_ ) * pp_poolsizem / ( pp_poolsizem - 1.0 );
155 
156  // _calculateFstValues
157  double const pp_avg = ( p1 + p2 ) / 2.0;
158  return ( pp - pp_avg ) / pp;
159  }
160 
161  // -------------------------------------------------------------------------
162  // Helper Functions
163  // -------------------------------------------------------------------------
164 
165 public:
166 
174  static std::tuple<double, double, double> f_st_pool_kofler_pi_snp(
175  SampleCounts const& p1, SampleCounts const& p2
176  ) {
177  using namespace genesis::utils;
178 
179  // _pi / _uncorrectedPiPerSNPFromFreqs
180  auto pi_snp_ = [](
181  double freq_a, double freq_c, double freq_g, double freq_t, double nt_cnt
182  ) {
183  double result = 1.0;
184  result -= squared( freq_a );
185  result -= squared( freq_c );
186  result -= squared( freq_g );
187  result -= squared( freq_t );
188  result *= nt_cnt / ( nt_cnt - 1.0 );
189  return result;
190  };
191 
192  // _calculateSNPFrequencies
193  // We cannot/do not want to simply call our heterozygosity() function here, as we need to
194  // re-use the frequencies anyway to compute their average, so we do everything at once here.
195 
196  // Get frequencies in sample 1
197  double const p1_nt_cnt = static_cast<double>( nucleotide_sum( p1 )); // eucov
198  double const p1_freq_a = static_cast<double>( p1.a_count ) / p1_nt_cnt;
199  double const p1_freq_c = static_cast<double>( p1.c_count ) / p1_nt_cnt;
200  double const p1_freq_g = static_cast<double>( p1.g_count ) / p1_nt_cnt;
201  double const p1_freq_t = static_cast<double>( p1.t_count ) / p1_nt_cnt;
202 
203  // Get frequencies in sample 2
204  double const p2_nt_cnt = static_cast<double>( nucleotide_sum( p2 )); // eucov
205  double const p2_freq_a = static_cast<double>( p2.a_count ) / p2_nt_cnt;
206  double const p2_freq_c = static_cast<double>( p2.c_count ) / p2_nt_cnt;
207  double const p2_freq_g = static_cast<double>( p2.g_count ) / p2_nt_cnt;
208  double const p2_freq_t = static_cast<double>( p2.t_count ) / p2_nt_cnt;
209 
210  // Compute their average
211  double const min_cnt = std::min( p1_nt_cnt, p2_nt_cnt );
212  double const avg_a = ( p1_freq_a + p2_freq_a ) / 2.0;
213  double const avg_c = ( p1_freq_c + p2_freq_c ) / 2.0;
214  double const avg_g = ( p1_freq_g + p2_freq_g ) / 2.0;
215  double const avg_t = ( p1_freq_t + p2_freq_t ) / 2.0;
216 
217  // _calculatePivalues / _pi / _uncorrectedPiPerSNPFromFreqs
218  auto const p1_pi = pi_snp_( p1_freq_a, p1_freq_c, p1_freq_g, p1_freq_t, p1_nt_cnt );
219  auto const p2_pi = pi_snp_( p2_freq_a, p2_freq_c, p2_freq_g, p2_freq_t, p2_nt_cnt );
220  auto const pp_pi = pi_snp_( avg_a, avg_c, avg_g, avg_t, min_cnt );
221 
222  return std::tuple<double, double, double>{ p1_pi, p2_pi, pp_pi };
223  }
224 
225  // -------------------------------------------------------------------------
226  // Private Member Variables
227  // -------------------------------------------------------------------------
228 
229 private:
230 
231  // Pool sizes
232  double p1_poolsize_ = 0;
233  double p2_poolsize_ = 0;
234 
235  // Theta Pi values for the two populations and their combination
236  utils::NeumaierSum p1_pi_sum_ = 0.0;
237  utils::NeumaierSum p2_pi_sum_ = 0.0;
238  utils::NeumaierSum pp_pi_sum_ = 0.0;
239 
240 };
241 
242 } // namespace population
243 } // namespace genesis
244 
245 #endif // include guard
genesis::utils::CompensatedSum
Compensated summation algorithmm, such as Kahan, Neumaier, and Klein summation.
Definition: compensated_sum.hpp:49
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
common.hpp
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::FstPoolCalculatorKofler::operator=
FstPoolCalculatorKofler & operator=(FstPoolCalculatorKofler const &)=default
genesis::population::FstPoolCalculatorKofler::~FstPoolCalculatorKofler
virtual ~FstPoolCalculatorKofler() override=default
std.hpp
Provides some valuable additions to STD.
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
genesis::population::FstPoolCalculatorKofler::f_st_pool_kofler_pi_snp
static std::tuple< double, double, double > f_st_pool_kofler_pi_snp(SampleCounts const &p1, SampleCounts const &p2)
Compute the SNP-based Theta Pi values used above.
Definition: fst_pool_kofler.hpp:174
fst_pool_calculator.hpp
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::BaseFstPoolCalculator
Base class to compute FST between two pooled samples, given two instances of SampleCounts.
Definition: fst_pool_calculator.hpp:65
genesis::population::FstPoolCalculatorKofler::FstPoolCalculatorKofler
FstPoolCalculatorKofler(size_t p1_poolsize, size_t p2_poolsize)
Definition: fst_pool_kofler.hpp:78
compensated_sum.hpp
genesis::population::FstPoolCalculatorKofler
Compute the F_ST statistic for pool-sequenced data of Kofler et al as used in PoPoolation2,...
Definition: fst_pool_kofler.hpp:70
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:223
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76