A library for working with phylogenetic and population genetic data.
v0.32.0
variant_filter_numerical.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
36 
37 #include <cassert>
38 #include <cmath>
39 #include <iostream>
40 #include <stdexcept>
41 #include <sstream>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Variant Filter Numerical
48 // =================================================================================================
49 
51  Variant& variant,
52  VariantFilterNumericalParams const& params,
53  VariantFilterStats& stats
54 ) {
55  // We do not filter further if the position has already been determined to be filered out.
56  if( ! variant.status.passing() ) {
57  return false;
58  }
59 
60  // Get the combined sum of all (passing) samples. Needed for all below.
61  auto const total = merge_sample_counts( variant, SampleCountsFilterPolicy::kOnlyPassing );
62 
63  // -------------------------------------------
64  // Numeric
65  // -------------------------------------------
66 
67  // Empty variants where all samples have zero counts are not interesting and filtered.
68  auto const sum = nucleotide_sum( total );
69  if( sum == 0 ) {
71  ++stats[VariantFilterTag::kEmpty];
72  return false;
73  }
74 
75  // ReadDepth
76  if( params.min_read_depth > 0 && sum < params.min_read_depth ) {
79  return false;
80  }
81  if( params.max_read_depth > 0 && sum > params.max_read_depth ) {
84  return false;
85  }
86 
87  // Check deletions, independently of the SNP status.
88  if(
89  params.deletions_count_limit > 0
90  && total.d_count > 0
91  && total.d_count >= params.deletions_count_limit
92  ) {
95  return false;
96  }
97 
98  // -------------------------------------------
99  // SNP vs Invariant
100  // -------------------------------------------
101 
102  // Everything below from here is only applied if we filter for SNPs
103  if( ! params.only_snps && ! params.only_biallelic_snps ) {
104  return true;
105  }
106 
107  // SNPs
108  if( params.snp_min_count == 0 && params.snp_max_count == 0 ) {
109  // Check without any min or max counts. Just look for pure SNPs.
110  // Has to be separated from the min/max count cases, as we might have minor allele
111  // snps that we want to ignore, but which would be considered here,
112  // for instance when deciding if a position is biallelic.
113  auto const al_count = allele_count( total );
114  if( params.only_snps && al_count < 2 ) {
116  ++stats[VariantFilterTag::kNotSnp];
117  return false;
118  }
119  if( params.only_biallelic_snps && al_count != 2 ) {
122  return false;
123  }
124  } else {
125  // Check with just the min count applied.
126  // We check the two filters separately here, to be able to increment the correct counter.
127  auto const al_count_min = allele_count(
128  total, params.snp_min_count
129  );
130  if( params.only_snps && al_count_min < 2 ) {
133  return false;
134  }
135  if( params.only_biallelic_snps && al_count_min != 2 ) {
138  return false;
139  }
140 
141  // And again, this time also considering the max count setting.
142  if( params.snp_max_count > 0 ) {
143  auto const al_count_min_max = allele_count(
144  total, params.snp_min_count, params.snp_max_count
145  );
146  if( params.only_snps && al_count_min_max < 2 ) {
149  return false;
150  }
151  if( params.only_biallelic_snps && al_count_min_max != 2 ) {
154  return false;
155  }
156  }
157  }
158 
159  // -------------------------------------------
160  // Allele frequency
161  // -------------------------------------------
162 
163  // Frequency
164  assert( params.only_snps || params.only_biallelic_snps );
165  if( params.snp_min_allele_frequency != 0.0 ) {
166  // Input setting sanity check
167  if(
168  ! std::isfinite( params.snp_min_allele_frequency ) ||
169  params.snp_min_allele_frequency < 0.0 ||
170  params.snp_min_allele_frequency > 1.0
171  ) {
172  throw std::runtime_error(
173  "Invalid VariantFilterNumericalParams::snp_min_allele_frequency == " +
175  );
176  }
177 
178  // Get the frequency, based on whether we can uses the bases or not.
179  auto const ref_base = utils::to_upper( variant.reference_base );
180  auto const alt_base = utils::to_upper( variant.alternative_base );
181  size_t ref_cnt = 0;
182  size_t alt_cnt = 0;
183  if( ! is_valid_base( ref_base )) {
184 
185  // Invalid ref base, will use counts to determine frequency.
186  auto const sorted_counts = sorted_sample_counts( total );
187  ref_cnt = sorted_counts[0].count;
188  alt_cnt = sorted_counts[1].count;
189 
190  } else if( ! is_valid_base( alt_base )) {
191 
192  // Valid ref base, but invalid alt base. Will use ref base and second most common count.
193  assert( is_valid_base( ref_base ));
194  auto const sorted_counts = sorted_sample_counts(
196  );
197  ref_cnt = sorted_counts[0].count;
198  alt_cnt = sorted_counts[1].count;
199 
200  } else {
201 
202  // Both ref and alt base are valid.
203  assert( is_valid_base( ref_base ));
204  assert( is_valid_base( alt_base ));
205 
206  // Use them to determine frequency.
207  ref_cnt = get_base_count( total, ref_base );
208  alt_cnt = get_base_count( total, alt_base );
209  }
210 
211  // Compute the frequency.
212  auto const cnt_sum = ref_cnt + alt_cnt;
213  auto const frequency = static_cast<double>( ref_cnt ) / static_cast<double>( cnt_sum );
214  assert( ! std::isfinite( frequency ) || ( frequency >= 0.0 && frequency <= 1.0 ));
215 
216  // Now do the filtering
217  if(
218  ! std::isfinite( frequency ) ||
219  frequency < params.snp_min_allele_frequency ||
220  1.0 - frequency < params.snp_min_allele_frequency
221  ) {
224  return false;
225  }
226  }
227 
228  // Everything passed.
229  return true;
230 }
231 
233  Variant& variant,
234  VariantFilterNumericalParams const& params
235 ) {
236  VariantFilterStats stats{};
237  return apply_variant_filter_numerical( variant, params, stats );
238 }
239 
240 } // namespace population
241 } // namespace genesis
genesis::population::VariantFilterNumericalParams::snp_max_count
size_t snp_max_count
Maximum count for each nucleotide to be considered a SNP for the whole Variant.
Definition: variant_filter_numerical.hpp:139
genesis::population::VariantFilterTag::kAboveSnpMaxCount
@ kAboveSnpMaxCount
Sum of nucleotides is above VariantFilterNumericalParams::snp_max_count.
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
functions.hpp
genesis::population::VariantFilterTag::kBelowMinAlleleFreq
@ kBelowMinAlleleFreq
Did not reach minimum allele frequency.
genesis::population::VariantFilterNumericalParams
Definition: variant_filter_numerical.hpp:55
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::VariantFilterNumericalParams::snp_min_count
size_t snp_min_count
Minimum count for each nucleotide to be considered a SNP for the whole Variant.
Definition: variant_filter_numerical.hpp:128
genesis::population::allele_count
size_t allele_count(SampleCounts const &sample)
Return the number of alleles, that is, of non-zero nucleotide counts of the sample.
Definition: population/function/functions.cpp:309
genesis::population::VariantFilterNumericalParams::only_biallelic_snps
bool only_biallelic_snps
Filter if the Variant does not have exactly two alleles.
Definition: variant_filter_numerical.hpp:118
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:79
genesis::population::get_base_count
SampleCounts::size_type get_base_count(SampleCounts const &sample, char base)
Get the count for a base given as a char.
Definition: population/function/functions.cpp:50
genesis::population::VariantFilterTag::kNotBiallelicSnp
@ kNotBiallelicSnp
SNP position, but not biallelic, i.e., has more than one alternative.
genesis::population::VariantFilterNumericalParams::deletions_count_limit
size_t deletions_count_limit
Maximum number of deletions at a position before being filtered out.
Definition: variant_filter_numerical.hpp:92
genesis::population::FilterStatus::set
void set(IntType value)
Definition: filter_status.hpp:100
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::VariantFilterTag::kEmpty
@ kEmpty
All counts across all samples are zero.
genesis::population::VariantFilterNumericalParams::max_read_depth
size_t max_read_depth
Maximum read depth expected for the whole Variant to be considered covered.
Definition: variant_filter_numerical.hpp:81
genesis::population::VariantFilterTag::kNotSnp
@ kNotSnp
Invariant position, not a SNP.
genesis::population::merge_sample_counts
SampleCounts merge_sample_counts(Variant const &v, SampleCountsFilterPolicy filter_policy)
Merge the counts of a vector SampleCountss.
Definition: population/function/functions.hpp:282
genesis::population::is_valid_base
constexpr bool is_valid_base(char c)
Return whether a given base is in ACGT, case insensitive.
Definition: population/function/functions.hpp:56
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::FilterStatus::passing
bool passing() const
Definition: filter_status.hpp:75
genesis::population::VariantFilterTag::kBelowMinReadDepth
@ kBelowMinReadDepth
Sum of counts across all samples is below the min read depth threshold.
genesis::population::VariantFilterNumericalParams::min_read_depth
size_t min_read_depth
Minimum read depth expected for the whole Variant to be considered covered.
Definition: variant_filter_numerical.hpp:69
genesis::population::SampleCountsFilterPolicy::kOnlyPassing
@ kOnlyPassing
variant_filter.hpp
char.hpp
genesis::population::VariantFilterTag::kAboveDeletionsCountLimit
@ kAboveDeletionsCountLimit
Too many deletions at the position.
variant_filter_numerical.hpp
genesis::population::FilterStats
Counts of how many entries with a particular Filter Tag occured in some data.
Definition: filter_stats.hpp:61
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:80
genesis::population::apply_variant_filter_numerical
bool apply_variant_filter_numerical(Variant &variant, VariantFilterNumericalParams const &params, VariantFilterStats &stats)
Filter a given Variant based on the numerical properties of the counts.
Definition: variant_filter_numerical.cpp:50
genesis::population::sorted_sample_counts
SortedSampleCounts sorted_sample_counts(SampleCounts const &sample)
Return the order of base counts (nucleotides), largest one first.
Definition: population/function/functions.cpp:134
genesis::population::VariantFilterTag::kBelowSnpMinCount
@ kBelowSnpMinCount
Sum of nucleotides is below VariantFilterNumericalParams::snp_min_count.
genesis::population::VariantFilterNumericalParams::only_snps
bool only_snps
Filter if the Variant does not have two or more alleles.
Definition: variant_filter_numerical.hpp:106
genesis::population::VariantFilterTag::kAboveMaxReadDepth
@ kAboveMaxReadDepth
Sum of counts across all samples is above the max read depth threshold.
genesis::population::VariantFilterNumericalParams::snp_min_allele_frequency
double snp_min_allele_frequency
Minimum allele frequency that needs to be achieved.
Definition: variant_filter_numerical.hpp:154
genesis::population::Variant::status
FilterStatus status
Definition: variant.hpp:76