A library for working with phylogenetic and population genetic data.
v0.32.0
window_average.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_WINDOW_AVERAGE_H_
2 #define GENESIS_POPULATION_FUNCTION_WINDOW_AVERAGE_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
43 
44 #include <cassert>
45 #include <limits>
46 #include <memory>
47 #include <stdexcept>
48 #include <string>
49 
50 namespace genesis {
51 namespace population {
52 
53 // =================================================================================================
54 // Window Averaging
55 // =================================================================================================
56 
66 {
76 
87 
104  kValidLoci,
105 
119  kValidSnps,
120 
124  kSum,
125 
135 };
136 
145 template<class D>
146 inline size_t get_window_length( BaseWindow<D> const& window )
147 {
148  // If the window is of type GenomeWindowStream, its total length is the sum of all lengths
149  // of the chromosomes that the genome has covered.
150  if( window.is_whole_genome() ) {
151  size_t window_length = 0;
152  for( auto const& chr_len : window.chromosomes() ) {
153  window_length += chr_len.second;
154  }
155  return window_length;
156  }
157 
158  // In all other cases, we simply use the window width function.
159  return window.width();
160 }
161 
165 template<class D>
167  BaseWindow<D> const& window,
168  std::shared_ptr<GenomeLocusSet> provided_loci
169 ) {
170  // We need a provided loci mask for this function.
171  if( !provided_loci ) {
172  throw std::invalid_argument(
173  "Cannot comput window average denominator from provided loci mask, "
174  "as no such mask was provided."
175  );
176  }
177 
178  // Helper function that takes a chromosome and positions on it, and returns the number
179  // of set loci on the provided mask in that window.
180  auto get_chr_loci_count_ = [&provided_loci]( std::string const& chr, size_t first, size_t last )
181  {
182  // Position checks. Should not happen if our internal usage is correct.
183  if( first == 0 || last == 0 || first > last ) {
184  throw std::invalid_argument(
185  "Invalid positions first=" + std::to_string( first ) + " last=" +
186  std::to_string(last) + " on chromosome \"" + chr +
187  "\" for computing provided loci mask window denominator."
188  );
189  }
190 
191  // Get the chromosome. This can fail if the user did not provide a fitting mask.
192  if( ! provided_loci->has_chromosome( chr )) {
193  throw std::runtime_error(
194  "Cannot compute provided loci on chromosome \"" + chr +
195  "\", as the provided loci mask does not contain the chromosome."
196  );
197  }
198  auto const& bv = provided_loci->chromosome_positions( chr );
199 
200  // Mask check. In our internal usage, this should not fail, but we check anyway,
201  // in case this function is called with a mask that is not meant for the given purpose.
202  if( bv.get(0) ) {
203  throw std::invalid_argument(
204  "Invvalid provided loci mask with bit 0 set."
205  );
206  }
207 
208  // Another check based on user data. Can fail if the user did not provide a fitting mask.
209  if( last >= bv.size() ) {
210  throw std::runtime_error(
211  "Cannot compute provided loci on chromosome \"" + chr +
212  "\", as the provided loci mask for the chromosome has length " +
213  std::to_string( bv.size() - 1 ) + ", but the window covers positions " +
214  std::to_string( first ) + "-" + std::to_string( last )
215  );
216  }
217 
218  // Finally, we have checked everything. Our first and last position are both inclusive,
219  // while the bitvector count uses past-the-end, so we need to add one here for the last.
220  return bv.count( first, last + 1 );
221  };
222 
223  // If the window is a WindowStream over a whole genome, we use all its chromosomes.
224  // This might not cover all chromosomes that the provided loci have data for,
225  // in case a region filter was applied, so we want to account for that.
226  // We are also rather strict in the process, to avoid accidental mismatches on the user side.
227  // We count all positions between the beginning of the chromosome, and the end as either
228  // specified in the data, or, if a sequence dict was given to the genome window stream, from that.
229  if( window.is_whole_genome() ) {
230  size_t loci_count = 0;
231  for( auto const& chr_len : window.chromosomes() ) {
232  loci_count += get_chr_loci_count_( chr_len.first, 1, chr_len.second );
233  }
234  return loci_count;
235  }
236 
237  // Here we are in the normal case for all other window types. For those, we just simply use
238  // their first and last positions. For ChromosomeWindowStream, this is similar as for the above
239  // whole genome WindowStream, and is also be based on the sequence dict, if given, or on the data.
240  return get_chr_loci_count_(
241  window.chromosome(), window.first_position(), window.last_position()
242  );
243 }
244 
256 template<class D>
258  WindowAveragePolicy policy,
259  BaseWindow<D> const& window,
260  std::shared_ptr<GenomeLocusSet> provided_loci,
261  VariantFilterStats const& variant_filter_stats,
262  SampleCountsFilterStats const& sample_counts_filter_stats
263 ) {
264  // We cannot have processed more samples than there were passing variants,
265  // as we only should have processed a sample once its variant is found to be passing.
266  // In case of the FST processor, we make sure that only one of the samples gets recorded
267  // in the stats, so this works there as well. We do not simply assert this here,
268  // as a misuse of the filters could result in an error here, which would be on the user
269  // side, and so is an exception.
270  // We skip this test when using the sum anyway, as in those cases, we might not have
271  // the correct filter stats available in the first place.
272  if(
273  policy != WindowAveragePolicy::kSum &&
274  sample_counts_filter_stats.sum() > variant_filter_stats[VariantFilterTag::kPassed]
275  ) {
276  throw std::invalid_argument(
277  "Invalid stat counts with sample_counts_filter_stats.sum() > "
278  "variant_filter_stats[VariantFilterTag::kPassed]"
279  );
280  }
281 
282  // Now select which value we want to return.
283  switch( policy ) {
285  return get_window_length( window );
286  }
288  auto const missing = variant_filter_stats_category_counts(
289  variant_filter_stats, VariantFilterTagCategory::kMissingInvalid
290  );
291  return variant_filter_stats.sum() - missing;
292  }
294  // Here, we use the number of positions that passed all total variant filters
295  // except for being a SNP, as well as the per-sample count of postions that
296  // furthermore passed all sample positions.
297  auto const valid_non_snps = variant_filter_stats_category_counts(
298  variant_filter_stats, VariantFilterTagCategory::kInvariant
299  );
300  auto const valid_snps = sample_counts_filter_stats_category_counts(
301  sample_counts_filter_stats, SampleCountsFilterTagCategory::kPassed
302  );
303  return valid_non_snps + valid_snps;
304  }
306  return sample_counts_filter_stats[SampleCountsFilterTag::kPassed];
307  }
309  return 1.0;
310  }
312  return get_window_provided_loci_count( window, provided_loci );
313  }
314  default: {
315  throw std::invalid_argument( "Invalid WindowAveragePolicy" );
316  }
317  }
318  return std::numeric_limits<double>::quiet_NaN();
319 }
320 
321 } // namespace population
322 } // namespace genesis
323 
324 #endif // include guard
genesis::population::GenomeLocusSet::chromosome_positions
utils::Bitvector const & chromosome_positions(std::string const &chromosome) const
For a given chromosome, return the Bitvector that stores its positions.
Definition: genome_locus_set.hpp:534
genesis::population::FilterStats::sum
size_t sum() const
Definition: filter_stats.hpp:145
genesis::population::get_window_provided_loci_count
size_t get_window_provided_loci_count(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci)
Get the count of provided loci in a window.
Definition: window_average.hpp:166
base_window.hpp
genesis::population::GenomeLocusSet::has_chromosome
bool has_chromosome(std::string const &chromosome) const
Return whether a chromosome is stored.
Definition: genome_locus_set.hpp:515
window_view.hpp
genesis::population::SampleCountsFilterTagCategory::kPassed
@ kPassed
SampleCounts has passed all filters.
genesis::population::BaseWindow::chromosomes
std::unordered_map< std::string, size_t > const & chromosomes() const
Get the list of all chromosomes along the genome, with their length.
Definition: base_window.hpp:214
genesis::population::variant_filter_stats_category_counts
VariantFilterCategoryStats variant_filter_stats_category_counts(VariantFilterStats const &stats)
Generate summary counts for a VariantFilterStats counter.
Definition: variant_filter.cpp:103
genesis::population::VariantFilterTagCategory::kInvariant
@ kInvariant
Postion is not a SNP according to the SNP filters.
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
sample_counts_filter.hpp
sample_counts.hpp
genesis::population::VariantFilterTagCategory::kMissingInvalid
@ kMissingInvalid
Position is missing or otherwise invalid.
filter_stats.hpp
genesis::population::BaseWindow::width
size_t width() const
Get the width of the Window.
Definition: base_window.hpp:173
genesis::population::BaseWindow::chromosome
std::string const & chromosome() const
Get the chromosome name that this Window belongs to.
Definition: base_window.hpp:90
genesis::population::WindowAveragePolicy::kWindowLength
@ kWindowLength
Use the window length.
genesis::population::WindowAveragePolicy::kSum
@ kSum
Simply report the total sum, with no averaging, i.e., the absolute value of the metric.
genesis::population::BaseWindow::is_whole_genome
bool is_whole_genome() const
Return if this instance is intended to be used for a whole genome stream.
Definition: base_window.hpp:192
genesis::population::sample_counts_filter_stats_category_counts
SampleCountsFilterCategoryStats sample_counts_filter_stats_category_counts(SampleCountsFilterStats const &stats)
Generate summary counts for a SampleCountsFilterStats counter.
Definition: sample_counts_filter.cpp:95
filter_status.hpp
genesis::population::SampleCountsFilterTag::kPassed
@ kPassed
Sample has passed all filters.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::window_average_denominator
double window_average_denominator(WindowAveragePolicy policy, BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats, SampleCountsFilterStats const &sample_counts_filter_stats)
Get the denoninator to use for averaging an estimator across a window.
Definition: window_average.hpp:257
genome_locus_set.hpp
genesis::population::WindowAveragePolicy::kAvailableLoci
@ kAvailableLoci
Use the number of positions for which there was data at all, independent of all filter settings....
genesis::population::get_window_length
size_t get_window_length(BaseWindow< D > const &window)
Get the length of a given Window.
Definition: window_average.hpp:146
variant_filter.hpp
genesis::population::WindowAveragePolicy
WindowAveragePolicy
Select the method to use for computing window averages of statistic estimators.
Definition: window_average.hpp:65
genesis::population::WindowAveragePolicy::kValidSnps
@ kValidSnps
Use the number of SNPs only.
genesis::population::BaseWindow::first_position
size_t first_position() const
Get the first position in the chromosome of the Window, that is, where the Window starts.
Definition: base_window.hpp:114
genesis::population::WindowAveragePolicy::kValidLoci
@ kValidLoci
Use the number of positions that passed all quality and numerical filters, excluding the SNP-related ...
genesis::population::FilterStats
Counts of how many entries with a particular Filter Tag occured in some data.
Definition: filter_stats.hpp:61
variant.hpp
genesis::population::BaseWindow::last_position
size_t last_position() const
Get the last position in the chromosome of the Window, that is, where the Window ends.
Definition: base_window.hpp:134
genesis::population::BaseWindow
Base class for Window and WindowView, to share common functionality.
Definition: base_window.hpp:60
genesis::population::WindowAveragePolicy::kProvidedLoci
@ kProvidedLoci
Use exactly the provided loci as set in the window of a GenomeLocusSet.
genesis::population::VariantFilterTag::kPassed
@ kPassed
Variant has passed all filters.