A library for working with phylogenetic and population genetic data.
v0.32.0
fst_pool_functions.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_FST_POOL_FUNCTIONS_H_
2 #define GENESIS_POPULATION_FUNCTION_FST_POOL_FUNCTIONS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
47 
48 #include <algorithm>
49 #include <cassert>
50 #include <cmath>
51 #include <limits>
52 #include <stdexcept>
53 #include <string>
54 #include <utility>
55 #include <vector>
56 
57 namespace genesis {
58 namespace population {
59 
60 // =================================================================================================
61 // Compute Helper
62 // =================================================================================================
63 
64 // Can only be used with C++14, as we _need_ generic lambda expressions (with `auto` type-specifier
65 // in the parameter list). Otherwise, the type that is actually needed for the range iterators
66 // in the fst_functor() call in the function is literally impossible to capture. That is because
67 // the type contains a TransformIterator, which has a lambda as one of its template parameters,
68 // which hence cannot be named or captured in any form other than a lambda or an auto context...
69 // See https://stackoverflow.com/a/4846597/4184258 for details. See the version history of this
70 // file (which is currently named `genesis/population/functions/structure.hpp`) for the previous
71 // instance of this function that also worked under C++11, by simply copy-pasting the contents of
72 // compute_pairwise_f_st() to the functions where it is used...
73 // That was ungly, and this function is currently not needed for our downstream tools
74 // (e.g., grenedalf, see https://github.com/lczech/grenedalf), so we just leave it active here
75 // when C++14 is used for compiling.
76 
77 #if __cplusplus >= 201402L
78 
120 template<class ForwardIterator, typename FstFunctor>
121 utils::Matrix<double> compute_pairwise_f_st(
122  ForwardIterator begin, ForwardIterator end,
123  FstFunctor fst_functor
124 ) {
125  // With no data, return empty result.
126  if( begin == end ) {
127  return {};
128  }
129 
130  // Now we know that there are entries in the rage. Use the first one to get the number of
131  // base pair samples in the range. We later check that this is the same for each entry.
132  // Use that size to initialize the resulting matrix.
133  size_t const size = static_cast<std::vector<SampleCounts> const&>( *begin ).size();
134  auto result = utils::Matrix<double>( size, size, 0.0 );
135 
136  // We use a lambda that returns a tranforming rage to select an entry at a given index
137  // in the set of SampleCounts at the current iterator position.
138  auto select_entry = [size]( ForwardIterator begin, ForwardIterator end, size_t index ){
139  // Currently, in order to use Window here, we need to explicitly use std::vector<SampleCounts>
140  // instead of the more generic T... Bit unfortunate, but good enough for now.
141  // Will have to revisit later if we get to use cases where the SampleCounts are not stored
142  // in a vector, but some other container.
143  // using T = typename ForwardIterator::value_type;
145  [size, index]( std::vector<SampleCounts> const& samples ) -> SampleCounts const& {
146  if( samples.size() != size ) {
147  throw std::runtime_error(
148  "In compute_pairwise_f_st(): The number of SampleCounts in the "
149  "provided range is not consistent throughout the iteration."
150  );
151  }
152  return samples[index];
153  },
154  begin, end
155  );
156  };
157 
158  // Loop over all pairs of entries, and compute f_st for each of these pairs.
159  // That is, in the inner code of the two loops, we run the f_st function that takes
160  // two ranges, providing it with a pair of indices for which we compute the value.
161  for( size_t i = 0; i < size; ++i ) {
162  for( size_t j = i + 1; j < size; ++j ) {
163  auto range_i = select_entry( begin, end, i );
164  auto range_j = select_entry( begin, end, j );
165  auto const fst = fst_functor(
166  i, j,
167  range_i.begin(), range_i.end(),
168  range_j.begin(), range_j.end()
169  );
170  result( i, j ) = fst;
171  result( j, i ) = fst;
172  }
173  }
174 
175  return result;
176 }
177 
178 #endif // __cplusplus >= 201402L
179 
180 // =================================================================================================
181 // F_ST Pool Kofler
182 // =================================================================================================
183 
187 template<class ForwardIterator1, class ForwardIterator2>
188 double f_st_pool_kofler( // get_conventional_fstcalculator
189  size_t p1_poolsize, size_t p2_poolsize,
190  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
191  ForwardIterator2 p2_begin, ForwardIterator2 p2_end,
192  bool only_passing_samples = true
193 ) {
194  // Edge and error cases
195  if( p1_poolsize <= 1 || p2_poolsize <= 1 ) {
196  return std::numeric_limits<double>::quiet_NaN();
197  // throw std::invalid_argument( "Cannot run f_st_pool_kofler() with poolsizes <= 1" );
198  }
199 
200  // Init the calculator.
201  FstPoolCalculatorKofler calc{ p1_poolsize, p2_poolsize };
202 
203  // Iterate the two ranges in parallel. Each iteration is one position in the genome.
204  // In each iteration, p1_it and p2_it point at SampleCounts objects containing nucleotide counts.
205  auto p1_it = p1_begin;
206  auto p2_it = p2_begin;
207  while( p1_it != p1_end && p2_it != p2_end ) {
208  if( only_passing_samples && ( !p1_it->status.passing() || !p2_it->status.passing() )) {
209  continue;
210  }
211 
212  calc.process( *p1_it, *p2_it );
213  ++p1_it;
214  ++p2_it;
215  }
216  if( p1_it != p1_end || p2_it != p2_end ) {
217  throw std::invalid_argument(
218  "In f_st_pool_kofler(): Provided ranges have different length."
219  );
220  }
221 
222  // Compute the final result.
223  return calc.get_result();
224 }
225 
226 #if __cplusplus >= 201402L
227 
235 template<class ForwardIterator>
237  std::vector<size_t> const& poolsizes,
238  ForwardIterator begin, ForwardIterator end
239 ) {
240  return compute_pairwise_f_st(
241  begin, end,
242  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
243  if( i >= poolsizes.size() || j >= poolsizes.size() ) {
244  throw std::runtime_error(
245  "In f_st_pool_kofler(): Provided ranges have different lengths that "
246  "are not identical to the number of poolsizes provided."
247  );
248  }
249  return f_st_pool_kofler(
250  poolsizes[i], poolsizes[j],
251  p1_begin, p1_end, p2_begin, p2_end
252  );
253  }
254  );
255 }
256 
257 #endif // __cplusplus >= 201402L
258 
259 // =================================================================================================
260 // F_ST Pool Karlsson
261 // =================================================================================================
262 
266 template<class ForwardIterator1, class ForwardIterator2>
267 double f_st_pool_karlsson( // get_asymptunbiased_fstcalculator
268  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
269  ForwardIterator2 p2_begin, ForwardIterator2 p2_end,
270  bool only_passing_samples = true
271 ) {
272  using namespace genesis::utils;
273 
274  // Init the calculator.
276 
277  // Iterate both ranges, summing up N_k and D_k for all their entries.
278  auto p1_it = p1_begin;
279  auto p2_it = p2_begin;
280  while( p1_it != p1_end && p2_it != p2_end ) {
281  if( only_passing_samples && ( !p1_it->status.passing() || !p2_it->status.passing() )) {
282  continue;
283  }
284 
285  calc.process( *p1_it, *p2_it );
286  ++p1_it;
287  ++p2_it;
288  }
289  if( p1_it != p1_end || p2_it != p2_end ) {
290  throw std::invalid_argument(
291  "In f_st_pool_karlsson(): Provided ranges have different length."
292  );
293  }
294 
295  return calc.get_result();
296 }
297 
298 #if __cplusplus >= 201402L
299 
307 template<class ForwardIterator>
309  ForwardIterator begin, ForwardIterator end
310 ) {
311  return compute_pairwise_f_st(
312  begin, end,
313  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
314  (void) i;
315  (void) j;
316  return f_st_pool_karlsson(
317  p1_begin, p1_end, p2_begin, p2_end
318  );
319  }
320  );
321 }
322 
323 #endif // __cplusplus >= 201402L
324 
325 // =================================================================================================
326 // F_ST Pool Unbiased (Spence)
327 // =================================================================================================
328 
332 template<class ForwardIterator1, class ForwardIterator2>
333 std::pair<double, double> f_st_pool_unbiased(
334  size_t p1_poolsize, size_t p2_poolsize,
335  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
336  ForwardIterator2 p2_begin, ForwardIterator2 p2_end,
337  bool only_passing_samples = true
338 ) {
339  // Edge and error cases
340  if( p1_poolsize <= 1 || p2_poolsize <= 1 ) {
341  return {
342  std::numeric_limits<double>::quiet_NaN(),
343  std::numeric_limits<double>::quiet_NaN()
344  };
345  // throw std::invalid_argument( "Cannot run f_st_pool_unbiased() with poolsizes <= 1" );
346  }
347 
348  // Init the calculator.
349  // For simplicity in this wrapper, we only allow to normalize the pi values per window
350  // via their sum; in this function, we do not have the additionally needed information
351  // on the Variant Filter Status statistics anyway. If that is needed, use FstPoolProcessor.
352  FstPoolCalculatorUnbiased calc{ p1_poolsize, p2_poolsize, WindowAveragePolicy::kSum };
353 
354  // Iterate the two ranges in parallel. Each iteration is one position in the genome.
355  // In each iteration, p1_it and p2_it point at SampleCounts objects containing nucleotide counts.
356  auto p1_it = p1_begin;
357  auto p2_it = p2_begin;
358  while( p1_it != p1_end && p2_it != p2_end ) {
359  if( only_passing_samples && ( !p1_it->status.passing() || !p2_it->status.passing() )) {
360  continue;
361  }
362 
363  calc.process( *p1_it, *p2_it );
364  ++p1_it;
365  ++p2_it;
366  }
367  if( p1_it != p1_end || p2_it != p2_end ) {
368  throw std::invalid_argument(
369  "In f_st_pool_unbiased(): Provided ranges have different length."
370  );
371  }
372 
373  // We use the result overload here that does not do window averaging, and just returns the sum.
374  return calc.get_result_pair();
375 }
376 
377 #if __cplusplus >= 201402L
378 
389 template<class ForwardIterator>
390 utils::Matrix<double> f_st_pool_unbiased_nei(
391  std::vector<size_t> const& poolsizes,
392  ForwardIterator begin, ForwardIterator end
393 ) {
394  return compute_pairwise_f_st(
395  begin, end,
396  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
397  if( i >= poolsizes.size() || j >= poolsizes.size() ) {
398  throw std::runtime_error(
399  "In f_st_pool_unbiased_nei(): Provided ranges have different lengths that "
400  "are not identical to the number of poolsizes provided."
401  );
402  }
403  return f_st_pool_unbiased(
404  poolsizes[i], poolsizes[j],
405  p1_begin, p1_end, p2_begin, p2_end
406  ).first;
407  }
408  );
409 }
410 
421 template<class ForwardIterator>
422 utils::Matrix<double> f_st_pool_unbiased_hudson(
423  std::vector<size_t> const& poolsizes,
424  ForwardIterator begin, ForwardIterator end
425 ) {
426  return compute_pairwise_f_st(
427  begin, end,
428  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
429  if( i >= poolsizes.size() || j >= poolsizes.size() ) {
430  throw std::runtime_error(
431  "In f_st_pool_unbiased_hudson(): Provided ranges have different lengths that "
432  "are not identical to the number of poolsizes provided."
433  );
434  }
435  return f_st_pool_unbiased(
436  poolsizes[i], poolsizes[j],
437  p1_begin, p1_end, p2_begin, p2_end
438  ).second;
439  }
440  );
441 }
442 
443 #endif // __cplusplus >= 201402L
444 
445 } // namespace population
446 } // namespace genesis
447 
448 #endif // include guard
genesis::population::BaseFstPoolCalculator::process
void process(SampleCounts const &p1, SampleCounts const &p2)
Definition: fst_pool_calculator.hpp:92
functions.hpp
fst_pool_processor.hpp
fst_pool_kofler.hpp
genesis::population::FstPoolCalculatorUnbiased
Compute our unbiased F_ST statistic for pool-sequenced data for two ranges of SampleCountss.
Definition: fst_pool_unbiased.hpp:82
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::utils::Matrix< double >
sample_counts_filter.hpp
window_average.hpp
genesis::population::f_st_pool_kofler
double f_st_pool_kofler(size_t p1_poolsize, size_t p2_poolsize, ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end, bool only_passing_samples=true)
Compute the F_ST statistic for pool-sequenced data of Kofler et al as used in PoPoolation2,...
Definition: fst_pool_functions.hpp:188
filter_stats.hpp
genesis::population::WindowAveragePolicy::kSum
@ kSum
Simply report the total sum, with no averaging, i.e., the absolute value of the metric.
matrix.hpp
filter_status.hpp
genesis::population::FstPoolCalculatorKarlsson
Compute the F_ST statistic for pool-sequenced data of Karlsson et al as used in PoPoolation2,...
Definition: fst_pool_karlsson.hpp:73
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
transform_iterator.hpp
variant_filter.hpp
fst_pool_unbiased.hpp
genesis::utils::make_transform_range
Range< TransformIterator< TransformFunctor, BaseIterator > > make_transform_range(TransformFunctor unary_func, BaseIterator begin, BaseIterator end)
Construct a transforming range, given the transformation function as well as the underlying base iter...
Definition: transform_iterator.hpp:438
variant.hpp
genesis::population::f_st_pool_karlsson
double f_st_pool_karlsson(ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end, bool only_passing_samples=true)
Compute the F_ST statistic for pool-sequenced data of Karlsson et al as used in PoPoolation2,...
Definition: fst_pool_functions.hpp:267
genesis::population::f_st_pool_unbiased
std::pair< double, double > f_st_pool_unbiased(size_t p1_poolsize, size_t p2_poolsize, ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end, bool only_passing_samples=true)
Compute our unbiased F_ST statistic for pool-sequenced data for two ranges of SampleCountss.
Definition: fst_pool_functions.hpp:333
genesis::population::FstPoolCalculatorKofler
Compute the F_ST statistic for pool-sequenced data of Kofler et al as used in PoPoolation2,...
Definition: fst_pool_kofler.hpp:70
fst_pool_karlsson.hpp