A library for working with phylogenetic and population genetic data.
v0.32.0
fst_pool_unbiased.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_FST_POOL_UNBIASED_H_
2 #define GENESIS_POPULATION_FUNCTION_FST_POOL_UNBIASED_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
47 
48 #include <cassert>
49 #include <cmath>
50 #include <limits>
51 #include <memory>
52 #include <stdexcept>
53 #include <string>
54 #include <tuple>
55 #include <utility>
56 #include <vector>
57 
58 namespace genesis {
59 namespace population {
60 
61 // =================================================================================================
62 // Fst Pool Calculator Karlsson
63 // =================================================================================================
64 
83 {
84 public:
85 
86  // -------------------------------------------------------------------------
87  // Typedefs and Enums
88  // -------------------------------------------------------------------------
89 
90  enum class Estimator
91  {
92  kNei,
93  kHudson
94  };
95 
96  struct PiValues
97  {
98  double pi_within;
99  double pi_between;
100  double pi_total;
101  };
102 
103  // -------------------------------------------------------------------------
104  // Constructors and Rule of Five
105  // -------------------------------------------------------------------------
106 
108  size_t smp1_poolsize,
109  size_t smp2_poolsize,
110  WindowAveragePolicy window_average_policy,
112  )
113  : smp1_poolsize_( smp1_poolsize )
114  , smp2_poolsize_( smp2_poolsize )
115  , avg_policy_( window_average_policy )
116  , estimator_( est )
117  {}
118 
119  virtual ~FstPoolCalculatorUnbiased() override = default;
120 
123 
126 
127  // -------------------------------------------------------------------------
128  // Abstract Members
129  // -------------------------------------------------------------------------
130 
131 private:
132 
133  void reset_() override
134  {
135  // Reset the internal counters, but not the pool sizes,
136  // so that the instance can be reused across windows.
137  sample_filter_stats_smp1_.clear();
138  sample_filter_stats_smp2_.clear();
139  sample_filter_stats_b_.clear();
140  pi_w_smp1_sum_ = 0.0;
141  pi_w_smp2_sum_ = 0.0;
142  pi_b_sum_ = 0.0;
143  }
144 
145  void process_( SampleCounts const& smp1, SampleCounts const& smp2 ) override
146  {
147  // Helper function for the two components of pi within
148  auto pi_within_partial_ = [](
149  double poolsize, double freq_a, double freq_c, double freq_g, double freq_t, double nt_cnt
150  ) {
151  assert( poolsize > 1.0 );
152 
153  double result = 1.0;
154  result -= utils::squared( freq_a );
155  result -= utils::squared( freq_c );
156  result -= utils::squared( freq_g );
157  result -= utils::squared( freq_t );
158  result *= nt_cnt / ( nt_cnt - 1.0 );
159  result *= poolsize / ( poolsize - 1.0 );
160  return result;
161  };
162 
163  // Prepare frequencies in sample 1. We only compute them when used.
164  double smp1_nt_cnt = 0.0;
165  double smp1_freq_a = 0.0;
166  double smp1_freq_c = 0.0;
167  double smp1_freq_g = 0.0;
168  double smp1_freq_t = 0.0;
169 
170  // Prepare frequencies in sample 2. We only compute them when used.
171  double smp2_nt_cnt = 0.0;
172  double smp2_freq_a = 0.0;
173  double smp2_freq_c = 0.0;
174  double smp2_freq_g = 0.0;
175  double smp2_freq_t = 0.0;
176 
177  // Compute pi within for smp1
178  ++sample_filter_stats_smp1_[ smp1.status.get() ];
179  if( smp1.status.passing() ) {
180  smp1_nt_cnt = static_cast<double>( nucleotide_sum( smp1 ));
181  smp1_freq_a = static_cast<double>( smp1.a_count ) / smp1_nt_cnt;
182  smp1_freq_c = static_cast<double>( smp1.c_count ) / smp1_nt_cnt;
183  smp1_freq_g = static_cast<double>( smp1.g_count ) / smp1_nt_cnt;
184  smp1_freq_t = static_cast<double>( smp1.t_count ) / smp1_nt_cnt;
185 
186  auto const pi_within_smp1 = pi_within_partial_(
187  smp1_poolsize_, smp1_freq_a, smp1_freq_c, smp1_freq_g, smp1_freq_t, smp1_nt_cnt
188  );
189  if( std::isfinite( pi_within_smp1 )) {
190  pi_w_smp1_sum_ += pi_within_smp1;
191  assert( smp1.a_count + smp1.c_count + smp1.g_count + smp1.t_count > 0 );
192  } else {
193  assert( smp1.a_count + smp1.c_count + smp1.g_count + smp1.t_count <= 1 );
194  }
195  }
196 
197  // Compute pi within for smp2
198  ++sample_filter_stats_smp2_[ smp2.status.get() ];
199  if( smp2.status.passing() ) {
200  smp2_nt_cnt = static_cast<double>( nucleotide_sum( smp2 ));
201  smp2_freq_a = static_cast<double>( smp2.a_count ) / smp2_nt_cnt;
202  smp2_freq_c = static_cast<double>( smp2.c_count ) / smp2_nt_cnt;
203  smp2_freq_g = static_cast<double>( smp2.g_count ) / smp2_nt_cnt;
204  smp2_freq_t = static_cast<double>( smp2.t_count ) / smp2_nt_cnt;
205 
206  auto const pi_within_smp2 = pi_within_partial_(
207  smp2_poolsize_, smp2_freq_a, smp2_freq_c, smp2_freq_g, smp2_freq_t, smp2_nt_cnt
208  );
209  if( std::isfinite( pi_within_smp2 )) {
210  pi_w_smp2_sum_ += pi_within_smp2;
211  assert( smp2.a_count + smp2.c_count + smp2.g_count + smp2.t_count > 0 );
212  } else {
213  assert( smp2.a_count + smp2.c_count + smp2.g_count + smp2.t_count <= 1 );
214  }
215  }
216 
217  // Compute pi between
218  if( smp1.status.passing() && smp2.status.passing() ) {
219  double pi_between = 1.0;
220  pi_between -= smp1_freq_a * smp2_freq_a;
221  pi_between -= smp1_freq_c * smp2_freq_c;
222  pi_between -= smp1_freq_g * smp2_freq_g;
223  pi_between -= smp1_freq_t * smp2_freq_t;
224  if( std::isfinite( pi_between )) {
225  pi_b_sum_ += pi_between;
226  }
227 
228  ++sample_filter_stats_b_[SampleCountsFilterTag::kPassed];
229  } else if( ! smp1.status.passing() && ! smp2.status.passing() ) {
230  ++sample_filter_stats_b_[ std::min( smp1.status.get(), smp2.status.get() )];
231  } else {
232  ++sample_filter_stats_b_[ std::max( smp1.status.get(), smp2.status.get() )];
233  }
234  }
235 
236  double get_result_() const override
237  {
238  // We have a bit of an unfortunate situation here from a software design point of view.
239  // The other fst calculator classes (Kofler and Karlsson, at the moment) do not use the
240  // window normalization, and it would be weird to use dummies there. So instead, in the
241  // FstPoolProcessor, we cast accordingly (which is ugly as well), to avoid this.
242 
243  // Here, we offer an additional function though for the non-window-averaging case,
244  // that just returns the sum. It uses the special overloads below.
245  switch( estimator_ ) {
246  case Estimator::kNei: {
247  return get_result_pair().first;
248  }
249  case Estimator::kHudson: {
250  return get_result_pair().second;
251  }
252  default: {
253  throw std::invalid_argument( "Invalid FstPoolCalculatorUnbiased::Estimator" );
254  }
255  }
256  return 0.0;
257  }
258 
259  // -------------------------------------------------------------------------
260  // Additional Members
261  // -------------------------------------------------------------------------
262 
263 public:
264 
265  // -------------------------------------------------------------------------
266  // With Window Averaging
267  // -------------------------------------------------------------------------
268 
269  template<class D>
270  double get_result(
271  BaseWindow<D> const& window,
272  std::shared_ptr<GenomeLocusSet> provided_loci,
273  VariantFilterStats const& variant_filter_stats
274  ) const {
275  switch( estimator_ ) {
276  case Estimator::kNei: {
277  return get_result_pair( window, provided_loci, variant_filter_stats ).first;
278  }
279  case Estimator::kHudson: {
280  return get_result_pair( window, provided_loci, variant_filter_stats ).second;
281  }
282  default: {
283  throw std::invalid_argument( "Invalid FstPoolCalculatorUnbiased::Estimator" );
284  }
285  }
286  return 0.0;
287  }
288 
292  template<class D>
293  std::pair<double, double> get_result_pair(
294  BaseWindow<D> const& window,
295  std::shared_ptr<GenomeLocusSet> provided_loci,
296  VariantFilterStats const& variant_filter_stats
297  ) const {
298  // Get the components that we need, each normalized using their own filter stats.
299  auto const pi_within = get_pi_within( window, provided_loci, variant_filter_stats );
300  auto const pi_between = get_pi_between( window, provided_loci, variant_filter_stats );
301  auto const pi_total = get_pi_total( pi_within, pi_between );
302 
303  // Final computation of our two FST estimators, using Nei and Hudson, respectively.
304  double const fst_nei = 1.0 - ( pi_within / pi_total );
305  double const fst_hud = 1.0 - ( pi_within / pi_between );
306  return { fst_nei, fst_hud };
307  }
308 
309  template<class D>
311  BaseWindow<D> const& window,
312  std::shared_ptr<GenomeLocusSet> provided_loci,
313  VariantFilterStats const& variant_filter_stats
314  ) const {
315  auto const pi_w_smp1 = pi_w_smp1_sum_.get() / window_average_denominator(
316  avg_policy_, window, provided_loci, variant_filter_stats, sample_filter_stats_smp1_
317  );
318  auto const pi_w_smp2 = pi_w_smp2_sum_.get() / window_average_denominator(
319  avg_policy_, window, provided_loci, variant_filter_stats, sample_filter_stats_smp2_
320  );
321  return 0.5 * ( pi_w_smp1 + pi_w_smp2 );
322  }
323 
324  template<class D>
326  BaseWindow<D> const& window,
327  std::shared_ptr<GenomeLocusSet> provided_loci,
328  VariantFilterStats const& variant_filter_stats
329  ) const {
330  return pi_b_sum_.get() / window_average_denominator(
331  avg_policy_, window, provided_loci, variant_filter_stats, sample_filter_stats_b_
332  );
333  }
334 
335  double get_pi_total( double pi_within, double pi_between ) const
336  {
337  return 0.5 * ( pi_within + pi_between );
338  }
339 
340  template<class D>
341  double get_pi_total(
342  BaseWindow<D> const& window,
343  std::shared_ptr<GenomeLocusSet> provided_loci,
344  VariantFilterStats const& variant_filter_stats
345  ) const {
346  auto const pi_within = get_pi_within( window, provided_loci, variant_filter_stats );
347  auto const pi_between = get_pi_between( window, provided_loci, variant_filter_stats );
348  return get_pi_total( pi_within, pi_between );
349  }
350 
351  template<class D>
353  BaseWindow<D> const& window,
354  std::shared_ptr<GenomeLocusSet> provided_loci,
355  VariantFilterStats const& variant_filter_stats
356  ) const {
357  PiValues result;
358  result.pi_within = get_pi_within( window, provided_loci, variant_filter_stats );
359  result.pi_between = get_pi_between( window, provided_loci, variant_filter_stats );
360  result.pi_total = get_pi_total( result.pi_within, result.pi_between );
361  return result;
362  }
363 
365  {
366  return avg_policy_;
367  }
368 
369  // -------------------------------------------------------------------------
370  // Without Window Averaging
371  // -------------------------------------------------------------------------
372 
376  std::pair<double, double> get_result_pair() const {
377  // Get the components that we need, each normalized using their own filter stats.
378  auto const pi_within = get_pi_within();
379  auto const pi_between = get_pi_between();
380  auto const pi_total = get_pi_total( pi_within, pi_between );
381 
382  // Final computation of our two FST estimators, using Nei and Hudson, respectively.
383  double const fst_nei = 1.0 - ( pi_within / pi_total );
384  double const fst_hud = 1.0 - ( pi_within / pi_between );
385  return { fst_nei, fst_hud };
386  }
387 
388  double get_pi_within() const
389  {
390  auto const pi_w_smp1 = pi_w_smp1_sum_.get();
391  auto const pi_w_smp2 = pi_w_smp2_sum_.get();
392  return 0.5 * ( pi_w_smp1 + pi_w_smp2 );
393  }
394 
395  double get_pi_between() const
396  {
397  return pi_b_sum_.get();
398  }
399 
400  double get_pi_total() const
401  {
402  auto const pi_within = get_pi_within();
403  auto const pi_between = get_pi_between();
404  return get_pi_total( pi_within, pi_between );
405  }
406 
408  {
409  PiValues result;
410  result.pi_within = get_pi_within();
411  result.pi_between = get_pi_between();
412  result.pi_total = get_pi_total( result.pi_within, result.pi_between );
413  return result;
414  }
415 
416  // -------------------------------------------------------------------------
417  // Private Member Variables
418  // -------------------------------------------------------------------------
419 
420 private:
421 
422  // Pool sizes
423  size_t smp1_poolsize_ = 0;
424  size_t smp2_poolsize_ = 0;
425 
426  // Parameters
427  WindowAveragePolicy avg_policy_;
428  Estimator estimator_ = Estimator::kNei;
429 
430  // Filter stats of everything that is processed here.
431  SampleCountsFilterStats sample_filter_stats_smp1_;
432  SampleCountsFilterStats sample_filter_stats_smp2_;
433  SampleCountsFilterStats sample_filter_stats_b_;
434 
435  // Sums over the window of pi within for both pools, and pi between them.
436  utils::NeumaierSum pi_w_smp1_sum_ = 0.0;
437  utils::NeumaierSum pi_w_smp2_sum_ = 0.0;
438  utils::NeumaierSum pi_b_sum_ = 0.0;
439 
440 };
441 
442 // =================================================================================================
443 // Estimator Helper Functions
444 // =================================================================================================
445 
448 ) {
449  switch( estimator ) {
451  return "Nei";
452  }
454  return "Hudson";
455  }
456  default: {
457  throw std::invalid_argument( "Invalid FstPoolCalculatorUnbiased::Estimator" );
458  }
459  }
460 }
461 
463  std::string const& str
464 ) {
465  auto const low = genesis::utils::to_lower( str );
466  if( low == "nei" ) {
468  }
469  if( low == "hudson" ) {
471  }
472  throw std::invalid_argument( "Invalid FstPoolCalculatorUnbiased::Estimator: " + str );
473 }
474 
475 } // namespace population
476 } // namespace genesis
477 
478 #endif // include guard
genesis::population::FstPoolCalculatorUnbiased::get_pi_values
PiValues get_pi_values() const
Definition: fst_pool_unbiased.hpp:407
genesis::population::FstPoolCalculatorUnbiased::get_pi_between
double get_pi_between(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats) const
Definition: fst_pool_unbiased.hpp:325
genesis::population::FstPoolCalculatorUnbiased::get_result_pair
std::pair< double, double > get_result_pair() const
Get both variants of FST, following Nei, and following Hudson, as a pair.
Definition: fst_pool_unbiased.hpp:376
functions.hpp
genesis::utils::CompensatedSum
Compensated summation algorithmm, such as Kahan, Neumaier, and Klein summation.
Definition: compensated_sum.hpp:49
common.hpp
base_window.hpp
genesis::population::FstPoolCalculatorUnbiased::PiValues
Definition: fst_pool_unbiased.hpp:96
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::FstPoolCalculatorUnbiased::get_pi_total
double get_pi_total(double pi_within, double pi_between) const
Definition: fst_pool_unbiased.hpp:335
genesis::population::FstPoolCalculatorUnbiased
Compute our unbiased F_ST statistic for pool-sequenced data for two ranges of SampleCountss.
Definition: fst_pool_unbiased.hpp:82
genesis::population::FstPoolCalculatorUnbiased::Estimator
Estimator
Definition: fst_pool_unbiased.hpp:90
std.hpp
Provides some valuable additions to STD.
genesis::population::FstPoolCalculatorUnbiased::get_pi_values
PiValues get_pi_values(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats) const
Definition: fst_pool_unbiased.hpp:352
genesis::population::FstPoolCalculatorUnbiased::get_pi_between
double get_pi_between() const
Definition: fst_pool_unbiased.hpp:395
sample_counts_filter.hpp
string.hpp
Provides some commonly used string utility functions.
window_average.hpp
filter_stats.hpp
fst_pool_calculator.hpp
genesis::population::FstPoolCalculatorUnbiased::Estimator::kHudson
@ kHudson
filter_status.hpp
genesis::population::FstPoolCalculatorUnbiased::operator=
FstPoolCalculatorUnbiased & operator=(FstPoolCalculatorUnbiased const &)=default
genesis::population::FstPoolCalculatorUnbiased::get_window_average_policy
WindowAveragePolicy get_window_average_policy() const
Definition: fst_pool_unbiased.hpp:364
genesis::population::SampleCountsFilterTag::kPassed
@ kPassed
Sample has passed all filters.
genesis::population::fst_pool_unbiased_estimator_from_string
FstPoolCalculatorUnbiased::Estimator fst_pool_unbiased_estimator_from_string(std::string const &str)
Definition: fst_pool_unbiased.hpp:462
genesis::population::FstPoolCalculatorUnbiased::get_result
double get_result(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats) const
Definition: fst_pool_unbiased.hpp:270
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::window_average_denominator
double window_average_denominator(WindowAveragePolicy policy, BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats, SampleCountsFilterStats const &sample_counts_filter_stats)
Get the denoninator to use for averaging an estimator across a window.
Definition: window_average.hpp:257
genome_locus_set.hpp
genesis::utils::CompensatedSum::get
double get() const
Definition: compensated_sum.hpp:221
genesis::population::FstPoolCalculatorUnbiased::PiValues::pi_between
double pi_between
Definition: fst_pool_unbiased.hpp:99
variant_filter.hpp
genesis::population::FstPoolCalculatorUnbiased::get_pi_within
double get_pi_within(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats) const
Definition: fst_pool_unbiased.hpp:310
genesis::population::WindowAveragePolicy
WindowAveragePolicy
Select the method to use for computing window averages of statistic estimators.
Definition: window_average.hpp:65
genesis::population::FstPoolCalculatorUnbiased::get_result_pair
std::pair< double, double > get_result_pair(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats) const
Get both variants of FST, following Nei, and following Hudson, as a pair.
Definition: fst_pool_unbiased.hpp:293
genesis::population::BaseFstPoolCalculator
Base class to compute FST between two pooled samples, given two instances of SampleCounts.
Definition: fst_pool_calculator.hpp:65
genesis::population::FstPoolCalculatorUnbiased::PiValues::pi_within
double pi_within
Definition: fst_pool_unbiased.hpp:98
genesis::population::fst_pool_unbiased_estimator_to_string
std::string fst_pool_unbiased_estimator_to_string(FstPoolCalculatorUnbiased::Estimator estimator)
Definition: fst_pool_unbiased.hpp:446
genesis::population::FstPoolCalculatorUnbiased::get_pi_total
double get_pi_total(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats) const
Definition: fst_pool_unbiased.hpp:341
genesis::population::FstPoolCalculatorUnbiased::~FstPoolCalculatorUnbiased
virtual ~FstPoolCalculatorUnbiased() override=default
genesis::population::FilterStats< VariantFilterTag >
genesis::population::FstPoolCalculatorUnbiased::get_pi_total
double get_pi_total() const
Definition: fst_pool_unbiased.hpp:400
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::population::FstPoolCalculatorUnbiased::get_pi_within
double get_pi_within() const
Definition: fst_pool_unbiased.hpp:388
genesis::population::FilterStats::clear
void clear()
Definition: filter_stats.hpp:175
genesis::population::BaseWindow
Base class for Window and WindowView, to share common functionality.
Definition: base_window.hpp:60
genesis::population::FstPoolCalculatorUnbiased::Estimator::kNei
@ kNei
compensated_sum.hpp
genesis::population::FstPoolCalculatorUnbiased::FstPoolCalculatorUnbiased
FstPoolCalculatorUnbiased(size_t smp1_poolsize, size_t smp2_poolsize, WindowAveragePolicy window_average_policy, Estimator est=Estimator::kNei)
Definition: fst_pool_unbiased.hpp:107
genesis::population::FstPoolCalculatorUnbiased::PiValues::pi_total
double pi_total
Definition: fst_pool_unbiased.hpp:100
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:223