A library for working with phylogenetic data.
v0.25.0
structure.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 #include <cassert>
37 #include <cmath>
38 #include <stdexcept>
39 
40 namespace genesis {
41 namespace population {
42 
43 // =================================================================================================
44 // F_ST Conventional Helper Functions
45 // =================================================================================================
46 
47 std::tuple<double, double, double> f_st_conventional_pool_pi_snp(
48  BaseCounts const& p1, BaseCounts const& p2
49 ) {
50  using namespace genesis::utils;
51 
52  // _pi / _uncorrectedPiPerSNPFromFreqs
53  auto pi_snp_ = [](
54  double freq_a, double freq_c, double freq_g, double freq_t, double nt_cnt
55  ) {
56  double result = 1.0;
57  result -= squared( freq_a );
58  result -= squared( freq_c );
59  result -= squared( freq_g );
60  result -= squared( freq_t );
61  result *= nt_cnt / ( nt_cnt - 1.0 );
62  return result;
63  };
64 
65  // _calculateSNPFrequencies
66  // We cannot/do not want to simply call our heterozygosity() function here, as we need to
67  // re-use the frequencies anyway to compute their average, so we do everything at once here.
68 
69  // Get frequencies in sample 1
70  double const p1_nt_cnt = static_cast<double>( nucleotide_sum( p1 )); // eucov
71  double const p1_freq_a = static_cast<double>( p1.a_count ) / p1_nt_cnt;
72  double const p1_freq_c = static_cast<double>( p1.c_count ) / p1_nt_cnt;
73  double const p1_freq_g = static_cast<double>( p1.g_count ) / p1_nt_cnt;
74  double const p1_freq_t = static_cast<double>( p1.t_count ) / p1_nt_cnt;
75 
76  // Get frequencies in sample 2
77  double const p2_nt_cnt = static_cast<double>( nucleotide_sum( p2 )); // eucov
78  double const p2_freq_a = static_cast<double>( p2.a_count ) / p2_nt_cnt;
79  double const p2_freq_c = static_cast<double>( p2.c_count ) / p2_nt_cnt;
80  double const p2_freq_g = static_cast<double>( p2.g_count ) / p2_nt_cnt;
81  double const p2_freq_t = static_cast<double>( p2.t_count ) / p2_nt_cnt;
82 
83  // Compute their average
84  double const min_cnt = std::min( p1_nt_cnt, p2_nt_cnt );
85  double const avg_a = ( p1_freq_a + p2_freq_a ) / 2.0;
86  double const avg_c = ( p1_freq_c + p2_freq_c ) / 2.0;
87  double const avg_g = ( p1_freq_g + p2_freq_g ) / 2.0;
88  double const avg_t = ( p1_freq_t + p2_freq_t ) / 2.0;
89 
90  // _calculatePivalues / _pi / _uncorrectedPiPerSNPFromFreqs
91  auto const p1_pi = pi_snp_( p1_freq_a, p1_freq_c, p1_freq_g, p1_freq_t, p1_nt_cnt );
92  auto const p2_pi = pi_snp_( p2_freq_a, p2_freq_c, p2_freq_g, p2_freq_t, p2_nt_cnt );
93  auto const pp_pi = pi_snp_( avg_a, avg_c, avg_g, avg_t, min_cnt );
94 
95  return std::tuple<double, double, double>{ p1_pi, p2_pi, pp_pi };
96 }
97 
98 // =================================================================================================
99 // F_ST Asymptotically Unbiased (Karlsson) Helper Functions
100 // =================================================================================================
101 
103 {
104  // get_a1a2n1n2
105  // We do not want expensive sorting and looking for nucleotide characters here,
106  // so instead, we use use sorting indices over arrays, and swap values in a sorting network
107  // to quickly find the largest two frequencies. Neat.
108 
109  // Get frequencies in sample 1
110  size_t const p1_cnts[] = { p1.a_count, p1.c_count, p1.g_count, p1.t_count };
111  double const p1_nt_cnt = static_cast<double>( nucleotide_sum( p1 )); // eucov
112  double const p1_freqs[] = {
113  static_cast<double>( p1.a_count ) / p1_nt_cnt,
114  static_cast<double>( p1.c_count ) / p1_nt_cnt,
115  static_cast<double>( p1.g_count ) / p1_nt_cnt,
116  static_cast<double>( p1.t_count ) / p1_nt_cnt
117  };
118 
119  // Get frequencies in sample 2
120  size_t const p2_cnts[] = { p2.a_count, p2.c_count, p2.g_count, p2.t_count };
121  double const p2_nt_cnt = static_cast<double>( nucleotide_sum( p2 )); // eucov
122  double const p2_freqs[] = {
123  static_cast<double>( p2.a_count ) / p2_nt_cnt,
124  static_cast<double>( p2.c_count ) / p2_nt_cnt,
125  static_cast<double>( p2.g_count ) / p2_nt_cnt,
126  static_cast<double>( p2.t_count ) / p2_nt_cnt
127  };
128 
129  // Edge case. If there are no counts at all, we return empty.
130  // The follow up function f_st_asymptotically_unbiased_nkdk() will also catch this edge case,
131  // return zeros as well, and nothing will be added to the total F_ST sum.
132  if( p1_nt_cnt == 0.0 || p2_nt_cnt == 0.0 ) {
133  return FstAN{};
134  }
135 
136  // Compute their averages.
137  double const avg_freqs[] = {
138  ( p1_freqs[0] + p2_freqs[0] ) / 2.0,
139  ( p1_freqs[1] + p2_freqs[1] ) / 2.0,
140  ( p1_freqs[2] + p2_freqs[2] ) / 2.0,
141  ( p1_freqs[3] + p2_freqs[3] ) / 2.0
142  };
143 
144  // Sort quickly via sorting network, putting large values first.
145  // See https://stackoverflow.com/a/25070688/4184258
146  // We however do not directly sort the values, as we instead need the sorting order
147  // to retrieve the values from the original counts. So, we sort their indices instead,
148  // using an array for simplicity of notation (all compiled away, and equivalent to using
149  // index_a = 0; index_b = 1;... instead here).
150  // Technically, there might be a better solution, as we are not interested in the order of
151  // the last two values. But seriously, that won't safe enough to justify the effort.
152  size_t indices[] = { 0, 1, 2, 3 };
153  if( avg_freqs[indices[0]] < avg_freqs[indices[1]] ) {
154  std::swap( indices[0], indices[1] );
155  }
156  if( avg_freqs[indices[2]] < avg_freqs[indices[3]] ) {
157  std::swap( indices[2], indices[3] );
158  }
159  if( avg_freqs[indices[0]] < avg_freqs[indices[2]] ) {
160  std::swap( indices[0], indices[2] );
161  }
162  if( avg_freqs[indices[1]] < avg_freqs[indices[3]] ) {
163  std::swap( indices[1], indices[3] );
164  }
165  if( avg_freqs[indices[1]] < avg_freqs[indices[2]] ) {
166  std::swap( indices[1], indices[2] );
167  }
168 
169  // Now they are sorted, largest ones first.
170  assert( avg_freqs[indices[0]] >= avg_freqs[indices[1]] );
171  assert( avg_freqs[indices[1]] >= avg_freqs[indices[2]] );
172  assert( avg_freqs[indices[2]] >= avg_freqs[indices[3]] );
173 
174  // Error check. We only want biallelic SNPs, so we check that the smallesst two values
175  // here are actually zero.
176  if( avg_freqs[indices[2]] != 0.0 || avg_freqs[indices[3]] != 0.0 ) {
177  throw std::runtime_error(
178  "In f_st_asymptotically_unbiased(): Expecting biallelic SNPs where only "
179  "two counts are > 0, but found more non-zero counts."
180  );
181  }
182 
183  // We checked that we have biallelic counts here. Assert this again.
184  assert( p1_freqs[indices[2]] == 0.0 && p1_freqs[indices[3]] == 0.0 );
185  assert( p2_freqs[indices[2]] == 0.0 && p2_freqs[indices[3]] == 0.0 );
186  assert( p1_cnts[indices[2]] == 0 && p1_cnts[indices[3]] == 0 );
187  assert( p2_cnts[indices[2]] == 0 && p2_cnts[indices[3]] == 0 );
188 
189  // Prepare result. We use explict names here for clarity;
190  // the compiler will get rid of these copies (hopefully).
191  FstAN result;
192  result.a_1 = static_cast<double>( p1_cnts[indices[0]] );
193  result.n_1 = static_cast<double>( p1_cnts[indices[0]] + p1_cnts[indices[1]] );
194  result.a_2 = static_cast<double>( p2_cnts[indices[0]] );
195  result.n_2 = static_cast<double>( p2_cnts[indices[0]] + p2_cnts[indices[1]] );
196  return result;
197 }
198 
199 std::pair<double, double> f_st_asymptotically_unbiased_nkdk( FstAN const& fstan ) // calculate_nk_dk
200 {
201  using namespace genesis::utils;
202 
203  // Edge case
204  if( fstan.n_1 <= 1.0 || fstan.n_2 <= 1.0 ) {
205  return { 0.0, 0.0 };
206  }
207  assert( fstan.n_1 > 1.0 );
208  assert( fstan.n_2 > 1.0 );
209  assert( fstan.a_1 <= fstan.n_1 );
210  assert( fstan.a_2 <= fstan.n_2 );
211 
212  // calculate_h1, calculate_h2
213  double const h1 = ( fstan.a_1 * ( fstan.n_1 - fstan.a_1 )) / ( fstan.n_1 * ( fstan.n_1 - 1.0 ));
214  double const h2 = ( fstan.a_2 * ( fstan.n_2 - fstan.a_2 )) / ( fstan.n_2 * ( fstan.n_2 - 1.0 ));
215 
216  // calculate_nk_dk
217  double const sqr = squared(( fstan.a_1 / fstan.n_1 ) - ( fstan.a_2 / fstan.n_2 ));
218  double const sub = ( h1 / fstan.n_1 + h2 / fstan.n_2 );
219  double const nk = sqr - sub;
220  double const dk = nk + h1 + h2;
221 
222  return { nk, dk };
223 }
224 
225 } // namespace population
226 } // namespace genesis
genesis::placement::swap
void swap(Sample &lhs, Sample &rhs)
Definition: sample.cpp:104
genesis::population::FstAN
Helper struct for the a_1, a_2, n_1, and n_2 values needed for f_st_asymptotically_unbiased().
Definition: structure.hpp:239
genesis::population::FstAN::n_1
double n_1
Definition: structure.hpp:242
genesis::population::BaseCounts::t_count
size_t t_count
Count of all T nucleotides that are present in the sample.
Definition: base_counts.hpp:73
common.hpp
genesis::population::BaseCounts::g_count
size_t g_count
Count of all G nucleotides that are present in the sample.
Definition: base_counts.hpp:68
genesis::population::BaseCounts::a_count
size_t a_count
Count of all A nucleotides that are present in the sample.
Definition: base_counts.hpp:58
genesis::population::f_st_conventional_pool_pi_snp
std::tuple< double, double, double > f_st_conventional_pool_pi_snp(BaseCounts const &p1, BaseCounts const &p2)
Compute the SNP-based Theta Pi values used in f_st_conventional_pool()
Definition: structure.cpp:47
structure.hpp
genesis::population::FstAN::n_2
double n_2
Definition: structure.hpp:244
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::FstAN::a_2
double a_2
Definition: structure.hpp:243
genesis::population::nucleotide_sum
size_t nucleotide_sum(BaseCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: functions/base_counts.hpp:177
genesis::population::BaseCounts::c_count
size_t c_count
Count of all C nucleotides that are present in the sample.
Definition: base_counts.hpp:63
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
base_counts.hpp
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:53
genesis::population::FstAN::a_1
double a_1
Definition: structure.hpp:241
genesis::population::f_st_asymptotically_unbiased_nkdk
std::pair< double, double > f_st_asymptotically_unbiased_nkdk(FstAN const &fstan)
Compute the N_k and D_k values needed for the asymptotically unbiased F_ST estimator of Karlsson et a...
Definition: structure.cpp:199
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:241
genesis::population::f_st_asymptotically_unbiased_a1n1a2n2
FstAN f_st_asymptotically_unbiased_a1n1a2n2(BaseCounts const &p1, BaseCounts const &p2)
Compute the a and n values needed for the asymptotically unbiased F_ST estimator of Karlsson et al.
Definition: structure.cpp:102