A library for working with phylogenetic and population genetic data.
v0.27.0
structure.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
34 
35 #include <cassert>
36 #include <cmath>
37 #include <stdexcept>
38 
39 namespace genesis {
40 namespace population {
41 
42 // =================================================================================================
43 // F_ST Pool Kofler
44 // =================================================================================================
45 
46 std::tuple<double, double, double> f_st_pool_kofler_pi_snp(
47  BaseCounts const& p1, BaseCounts const& p2
48 ) {
49  using namespace genesis::utils;
50 
51  // _pi / _uncorrectedPiPerSNPFromFreqs
52  auto pi_snp_ = [](
53  double freq_a, double freq_c, double freq_g, double freq_t, double nt_cnt
54  ) {
55  double result = 1.0;
56  result -= squared( freq_a );
57  result -= squared( freq_c );
58  result -= squared( freq_g );
59  result -= squared( freq_t );
60  result *= nt_cnt / ( nt_cnt - 1.0 );
61  return result;
62  };
63 
64  // _calculateSNPFrequencies
65  // We cannot/do not want to simply call our heterozygosity() function here, as we need to
66  // re-use the frequencies anyway to compute their average, so we do everything at once here.
67 
68  // Get frequencies in sample 1
69  double const p1_nt_cnt = static_cast<double>( nucleotide_sum( p1 )); // eucov
70  double const p1_freq_a = static_cast<double>( p1.a_count ) / p1_nt_cnt;
71  double const p1_freq_c = static_cast<double>( p1.c_count ) / p1_nt_cnt;
72  double const p1_freq_g = static_cast<double>( p1.g_count ) / p1_nt_cnt;
73  double const p1_freq_t = static_cast<double>( p1.t_count ) / p1_nt_cnt;
74 
75  // Get frequencies in sample 2
76  double const p2_nt_cnt = static_cast<double>( nucleotide_sum( p2 )); // eucov
77  double const p2_freq_a = static_cast<double>( p2.a_count ) / p2_nt_cnt;
78  double const p2_freq_c = static_cast<double>( p2.c_count ) / p2_nt_cnt;
79  double const p2_freq_g = static_cast<double>( p2.g_count ) / p2_nt_cnt;
80  double const p2_freq_t = static_cast<double>( p2.t_count ) / p2_nt_cnt;
81 
82  // Compute their average
83  double const min_cnt = std::min( p1_nt_cnt, p2_nt_cnt );
84  double const avg_a = ( p1_freq_a + p2_freq_a ) / 2.0;
85  double const avg_c = ( p1_freq_c + p2_freq_c ) / 2.0;
86  double const avg_g = ( p1_freq_g + p2_freq_g ) / 2.0;
87  double const avg_t = ( p1_freq_t + p2_freq_t ) / 2.0;
88 
89  // _calculatePivalues / _pi / _uncorrectedPiPerSNPFromFreqs
90  auto const p1_pi = pi_snp_( p1_freq_a, p1_freq_c, p1_freq_g, p1_freq_t, p1_nt_cnt );
91  auto const p2_pi = pi_snp_( p2_freq_a, p2_freq_c, p2_freq_g, p2_freq_t, p2_nt_cnt );
92  auto const pp_pi = pi_snp_( avg_a, avg_c, avg_g, avg_t, min_cnt );
93 
94  return std::tuple<double, double, double>{ p1_pi, p2_pi, pp_pi };
95 }
96 
97 // =================================================================================================
98 // F_ST Pool Karlsson
99 // =================================================================================================
100 
101 std::pair<double, double> f_st_pool_karlsson_nkdk(
102  std::pair<SortedBaseCounts, SortedBaseCounts> const& sample_counts
103 ) {
104  // PoPoolation2 function: calculate_nk_dk
105  using namespace genesis::utils;
106 
107  // Error check. We only want biallelic SNPs, so we check that the smallesst two values
108  // here are actually zero.
109  if(
110  sample_counts.first[2].count != 0 || sample_counts.first[3].count != 0 ||
111  sample_counts.second[2].count != 0 || sample_counts.second[3].count != 0
112  ) {
113  throw std::runtime_error(
114  "In f_st_pool_karlsson(): Expecting biallelic SNPs where only "
115  "two nucleotide counts are > 0, but found more non-zero counts."
116  );
117  }
118 
119  // We checked that we have biallelic counts here. Assert this again.
120  // Also assert that the bases are in the same order in both samples.
121  assert( sample_counts.first[2].count == 0 && sample_counts.first[3].count == 0 );
122  assert( sample_counts.second[2].count == 0 && sample_counts.second[3].count == 0 );
123  assert(
124  sample_counts.first[0].base == sample_counts.second[0].base &&
125  sample_counts.first[1].base == sample_counts.second[1].base &&
126  sample_counts.first[2].base == sample_counts.second[2].base &&
127  sample_counts.first[3].base == sample_counts.second[3].base
128  );
129 
130  // Get the major allele count (`a` here and in PoPoolation2),
131  // the minor allele count (`b` here, not used in PoPoolation2 under that name),
132  // and the total coverage (`n` here and in PoPoolation2).
133  auto const a_1 = static_cast<double>( sample_counts.first[0].count );
134  auto const b_1 = static_cast<double>( sample_counts.first[1].count );
135  auto const n_1 = a_1 + b_1;
136  auto const a_2 = static_cast<double>( sample_counts.second[0].count );
137  auto const b_2 = static_cast<double>( sample_counts.second[1].count );
138  auto const n_2 = a_2 + b_2;
139 
140  // Edge case
141  if( n_1 <= 1.0 || n_2 <= 1.0 ) {
142  return { 0.0, 0.0 };
143  }
144  assert( n_1 > 1.0 );
145  assert( n_2 > 1.0 );
146  assert( a_1 <= n_1 );
147  assert( a_2 <= n_2 );
148 
149  // PoPoolation2 functions: calculate_h1, calculate_h2
150  double const h1 = ( a_1 * b_1 ) / ( n_1 * ( n_1 - 1.0 ));
151  double const h2 = ( a_2 * b_2 ) / ( n_2 * ( n_2 - 1.0 ));
152 
153  // PoPoolation2 function: calculate_nk_dk
154  double const sqr = squared(( a_1 / n_1 ) - ( a_2 / n_2 ));
155  double const sub = ( h1 / n_1 + h2 / n_2 );
156  double const nk = sqr - sub;
157  double const dk = nk + h1 + h2;
158 
159  return { nk, dk };
160 }
161 
162 // =================================================================================================
163 // F_ST Pool Unbiased (Spence)
164 // =================================================================================================
165 
166 std::tuple<double, double, double> f_st_pool_unbiased_pi_snp(
167  size_t p1_poolsize, size_t p2_poolsize,
168  BaseCounts const& p1_counts, BaseCounts const& p2_counts
169 ) {
170  using namespace genesis::utils;
171 
172  // There is some code duplication from f_st_pool_kofler_pi_snp() here, but for speed reasons,
173  // we keep it that way for now. Not worth calling more functions than needed.
174 
175  // Helper function for the two components of pi within
176  auto pi_within_partial_ = [](
177  double poolsize, double freq_a, double freq_c, double freq_g, double freq_t, double nt_cnt
178  ) {
179  assert( poolsize > 1.0 );
180 
181  double result = 1.0;
182  result -= squared( freq_a );
183  result -= squared( freq_c );
184  result -= squared( freq_g );
185  result -= squared( freq_t );
186  result *= nt_cnt / ( nt_cnt - 1.0 );
187  result *= poolsize / ( poolsize - 1.0 );
188  return result;
189  };
190 
191  // Get frequencies in sample 1
192  double const p1_nt_cnt = static_cast<double>( nucleotide_sum( p1_counts ));
193  double const p1_freq_a = static_cast<double>( p1_counts.a_count ) / p1_nt_cnt;
194  double const p1_freq_c = static_cast<double>( p1_counts.c_count ) / p1_nt_cnt;
195  double const p1_freq_g = static_cast<double>( p1_counts.g_count ) / p1_nt_cnt;
196  double const p1_freq_t = static_cast<double>( p1_counts.t_count ) / p1_nt_cnt;
197 
198  // Get frequencies in sample 2
199  double const p2_nt_cnt = static_cast<double>( nucleotide_sum( p2_counts ));
200  double const p2_freq_a = static_cast<double>( p2_counts.a_count ) / p2_nt_cnt;
201  double const p2_freq_c = static_cast<double>( p2_counts.c_count ) / p2_nt_cnt;
202  double const p2_freq_g = static_cast<double>( p2_counts.g_count ) / p2_nt_cnt;
203  double const p2_freq_t = static_cast<double>( p2_counts.t_count ) / p2_nt_cnt;
204 
205  // Compute pi within
206  auto const pi_within = 0.5 * (
207  pi_within_partial_( p1_poolsize, p1_freq_a, p1_freq_c, p1_freq_g, p1_freq_t, p1_nt_cnt ) +
208  pi_within_partial_( p2_poolsize, p2_freq_a, p2_freq_c, p2_freq_g, p2_freq_t, p2_nt_cnt )
209  );
210 
211  // Compute pi between
212  double pi_between = 1.0;
213  pi_between -= p1_freq_a * p2_freq_a;
214  pi_between -= p1_freq_c * p2_freq_c;
215  pi_between -= p1_freq_g * p2_freq_g;
216  pi_between -= p1_freq_t * p2_freq_t;
217 
218  // Compute pi total
219  double const pi_total = 0.5 * ( pi_within + pi_between );
220 
221  return std::tuple<double, double, double>{ pi_within, pi_between, pi_total };
222 }
223 
224 } // namespace population
225 } // namespace genesis
genesis::population::f_st_pool_kofler_pi_snp
std::tuple< double, double, double > f_st_pool_kofler_pi_snp(BaseCounts const &p1, BaseCounts const &p2)
Compute the SNP-based Theta Pi values used in f_st_pool_kofler().
Definition: structure.cpp:46
genesis::population::f_st_pool_karlsson_nkdk
std::pair< double, double > f_st_pool_karlsson_nkdk(std::pair< SortedBaseCounts, SortedBaseCounts > const &sample_counts)
Compute the numerator N_k and denominator D_k needed for the asymptotically unbiased F_ST estimator o...
Definition: structure.cpp:101
genesis::population::BaseCounts::t_count
size_t t_count
Count of all T nucleotides that are present in the sample.
Definition: base_counts.hpp:74
common.hpp
genesis::population::BaseCounts::g_count
size_t g_count
Count of all G nucleotides that are present in the sample.
Definition: base_counts.hpp:69
genesis::population::BaseCounts::a_count
size_t a_count
Count of all A nucleotides that are present in the sample.
Definition: base_counts.hpp:59
structure.hpp
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::nucleotide_sum
size_t nucleotide_sum(BaseCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/functions/functions.hpp:222
genesis::population::BaseCounts::c_count
size_t c_count
Count of all C nucleotides that are present in the sample.
Definition: base_counts.hpp:64
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:54
genesis::population::f_st_pool_unbiased_pi_snp
std::tuple< double, double, double > f_st_pool_unbiased_pi_snp(size_t p1_poolsize, size_t p2_poolsize, BaseCounts const &p1_counts, BaseCounts const &p2_counts)
Compute the SNP-based Theta Pi values used in f_st_pool_unbiased().
Definition: structure.cpp:166
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:247