A library for working with phylogenetic and population genetic data.
v0.32.0
sample_counts_filter_numerical.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
36 
37 #include <cassert>
38 #include <cmath>
39 #include <iostream>
40 #include <stdexcept>
41 #include <sstream>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Transform by Count
48 // =================================================================================================
49 
51  SampleCounts& sample,
52  size_t min_count,
53  bool also_n_and_d_counts
54 ) {
55  // Reset counts if needed, according to min count setting.
56  sample.a_count = sample.a_count < min_count ? 0 : sample.a_count;
57  sample.c_count = sample.c_count < min_count ? 0 : sample.c_count;
58  sample.g_count = sample.g_count < min_count ? 0 : sample.g_count;
59  sample.t_count = sample.t_count < min_count ? 0 : sample.t_count;
60 
61  // Also for the two others, if needed.
62  if( also_n_and_d_counts ) {
63  sample.n_count = sample.n_count < min_count ? 0 : sample.n_count;
64  sample.d_count = sample.d_count < min_count ? 0 : sample.d_count;
65  }
66 }
67 
69  Variant& variant,
70  size_t min_count,
71  bool also_n_and_d_counts
72 ) {
73  for( auto& sample : variant.samples ) {
74  transform_zero_out_by_min_count( sample, min_count, also_n_and_d_counts );
75  }
76 }
77 
79  SampleCounts& sample,
80  size_t max_count,
81  bool also_n_and_d_counts
82 ) {
83  // Need an extra check here first.
84  if( max_count == 0 ) {
85  return;
86  }
87 
88  // Reset counts if needed, according to max count setting.
89  sample.a_count = sample.a_count > max_count ? 0 : sample.a_count;
90  sample.c_count = sample.c_count > max_count ? 0 : sample.c_count;
91  sample.g_count = sample.g_count > max_count ? 0 : sample.g_count;
92  sample.t_count = sample.t_count > max_count ? 0 : sample.t_count;
93 
94  // Also for the two others, if needed.
95  if( also_n_and_d_counts ) {
96  sample.n_count = sample.n_count > max_count ? 0 : sample.n_count;
97  sample.d_count = sample.d_count > max_count ? 0 : sample.d_count;
98  }
99 }
100 
102  Variant& variant,
103  size_t max_count,
104  bool also_n_and_d_counts
105 ) {
106  for( auto& sample : variant.samples ) {
107  transform_zero_out_by_max_count( sample, max_count, also_n_and_d_counts );
108  }
109 }
110 
111 // =================================================================================================
112 // Sample Counts Filter Numerical
113 // =================================================================================================
114 
116  SampleCounts& sample,
117  SampleCountsFilterNumericalParams const& params,
119 ) {
120  // We do not filter further if the sample has already been determined to be filered out.
121  if( ! sample.status.passing() ) {
122  return false;
123  }
124 
125  // -------------------------------------------
126  // Numeric
127  // -------------------------------------------
128 
129  // Counts
130  if( params.min_count > 0 ) {
131  transform_zero_out_by_min_count( sample, params.min_count );
132  }
133  if( params.max_count > 0 ) {
134  transform_zero_out_by_max_count( sample, params.max_count );
135  }
136 
137  // Check deletions.
138  if(
139  params.deletions_count_limit > 0
140  && sample.d_count > 0
141  && sample.d_count >= params.deletions_count_limit
142  ) {
143  // if( params.clear_filtered ) {
144  // sample.clear();
145  // }
146 
149  return false;
150  }
151 
152  // Empty?
153  auto const sum = nucleotide_sum( sample );
154  if( sum == 0 ) {
155  // if( params.clear_filtered ) {
156  // sample.clear();
157  // }
158 
161  return false;
162  }
163 
164  // Read depth
165  if( sum < params.min_read_depth ) {
166  // if( params.clear_filtered ) {
167  // sample.clear();
168  // }
169 
172  return false;
173  }
174  if( params.max_read_depth > 0 && sum > params.max_read_depth ) {
175  // if( params.clear_filtered ) {
176  // sample.clear();
177  // }
178 
181  return false;
182  }
183 
184  // -------------------------------------------
185  // SNP vs Invariant
186  // -------------------------------------------
187 
188  // SNPs
189  if( params.only_snps || params.only_biallelic_snps ) {
190 
191  // Determine type of SNP.
192  auto const al_count = allele_count( sample );
193  if( params.only_snps && al_count < 2 ) {
194  // if( params.clear_filtered ) {
195  // sample.clear();
196  // }
197 
200  return false;
201  }
202  if( params.only_biallelic_snps && al_count != 2 ) {
203  // if( params.clear_filtered ) {
204  // sample.clear();
205  // }
206 
209  return false;
210  }
211  }
212 
213  // Everything passed.
214  return true;
215 }
216 
218  SampleCounts& sample,
220 ) {
221  SampleCountsFilterStats stats{};
222  return apply_sample_counts_filter_numerical( sample, params, stats );
223 }
224 
225 // =================================================================================================
226 // Variant Sample Counts Filter Numerical
227 // =================================================================================================
228 
230  Variant& variant,
231  SampleCountsFilterNumericalParams const& params,
232  VariantFilterStats& variant_stats,
233  SampleCountsFilterStats& sample_count_stats,
234  bool all_need_pass
235 ) {
236  // We do not filter further if the position has already been determined to be filered out.
237  if( ! variant.status.passing() ) {
238  return false;
239  }
240 
241  // Apply the filter to all samples, and count how many passed.
242  size_t passed_cnt = 0;
243  for( auto& sample : variant.samples ) {
244  auto const passed = apply_sample_counts_filter_numerical(
245  sample, params, sample_count_stats
246  );
247  if( passed ) {
248  ++passed_cnt;
249  }
250  }
251 
252  // Use that number to decide if the whole Variant is passing or not.
253  // If no sample passed, that's a fail for the Variant in either case as well.
254  if( passed_cnt == 0 ) {
256  ++variant_stats[VariantFilterTag::kNoSamplePassed];
257  return false;
258  }
259 
260  // If all samples need to pass, we need an extra check.
261  if( all_need_pass ) {
262  if( passed_cnt == variant.samples.size() ) {
263  return true;
264  } else {
267  return false;
268  }
269  }
270 
271  // Here, some samples passed, and we are not requiring that all pass, so that's a pass overall.
272  assert( passed_cnt > 0 );
273  assert( !all_need_pass );
274  return true;
275 }
276 
278  Variant& variant,
279  SampleCountsFilterNumericalParams const& params,
280  bool all_need_pass
281 ) {
282  VariantFilterStats variant_stats{};
283  SampleCountsFilterStats sample_count_stats{};
285  variant, params, variant_stats, sample_count_stats, all_need_pass
286  );
287 }
288 
289 } // namespace population
290 } // namespace genesis
genesis::population::SampleCountsFilterTag::kAboveDeletionsCountLimit
@ kAboveDeletionsCountLimit
Too many deletions at the position.
genesis::population::transform_zero_out_by_max_count
void transform_zero_out_by_max_count(SampleCounts &sample, size_t max_count, bool also_n_and_d_counts)
Transform a SampleCounts sample by setting any nucleotide count (A, C, G, T) to zero if max_count is ...
Definition: sample_counts_filter_numerical.cpp:78
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::population::SampleCountsFilterNumericalParams
Filter settings to filter and transform SampleCounts.
Definition: sample_counts_filter_numerical.hpp:123
functions.hpp
sample_counts_filter_numerical.hpp
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::allele_count
size_t allele_count(SampleCounts const &sample)
Return the number of alleles, that is, of non-zero nucleotide counts of the sample.
Definition: population/function/functions.cpp:309
genesis::population::SampleCounts::status
FilterStatus status
Status to indicate whether any applied filters failed to pass.
Definition: sample_counts.hpp:96
genesis::population::FilterStatus::set
void set(IntType value)
Definition: filter_status.hpp:100
sample_counts_filter.hpp
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
genesis::population::SampleCountsFilterNumericalParams::min_count
size_t min_count
Minimum count for each nucleotide to be considered. All counts below are set to zero.
Definition: sample_counts_filter_numerical.hpp:134
genesis::population::apply_sample_counts_filter_numerical
bool apply_sample_counts_filter_numerical(SampleCounts &sample, SampleCountsFilterNumericalParams const &params, SampleCountsFilterStats &stats)
Filter a given SampleCounts based on the numerical properties of the counts.
Definition: sample_counts_filter_numerical.cpp:115
genesis::population::SampleCountsFilterTag::kAboveMaxReadDepth
@ kAboveMaxReadDepth
Sum of counts across all nucleotide counts is above the max read depth threshold.
genesis::population::SampleCountsFilterNumericalParams::max_count
size_t max_count
Maximum count for each nucleotide to be considered. All counts above are set to zero.
Definition: sample_counts_filter_numerical.hpp:141
genesis::population::SampleCounts::n_count
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: sample_counts.hpp:86
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
genesis::population::SampleCountsFilterTag::kBelowMinReadDepth
@ kBelowMinReadDepth
Sum of counts across all nucleotide counts is below the min read depth threshold.
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis::population::SampleCountsFilterTag::kEmpty
@ kEmpty
Zero nucleotide counts, after zeroing out counts based on the min_count and max_count.
genesis::population::transform_zero_out_by_min_count
void transform_zero_out_by_min_count(SampleCounts &sample, size_t min_count, bool also_n_and_d_counts)
Transform a SampleCounts sample by setting any nucleotide count (A, C, G, T) to zero if min_count is ...
Definition: sample_counts_filter_numerical.cpp:50
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::FilterStatus::passing
bool passing() const
Definition: filter_status.hpp:75
genesis::population::SampleCountsFilterNumericalParams::deletions_count_limit
size_t deletions_count_limit
Maximum number of deletions at a position before being filtered out.
Definition: sample_counts_filter_numerical.hpp:152
char.hpp
genesis::population::VariantFilterTag::kNoSamplePassed
@ kNoSamplePassed
None of the SampleCounts of the Variant passed their filters.
genesis::population::SampleCountsFilterTag::kNotSnp
@ kNotSnp
Invariant position, not a SNP.
genesis::population::VariantFilterTag::kNotAllSamplesPassed
@ kNotAllSamplesPassed
Some of the SampleCounts of the Variant did not pass their filters.
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
genesis::population::FilterStats
Counts of how many entries with a particular Filter Tag occured in some data.
Definition: filter_stats.hpp:61
genesis::population::SampleCountsFilterNumericalParams::only_biallelic_snps
bool only_biallelic_snps
Filter if the sample does not have exactly two alleles.
Definition: sample_counts_filter_numerical.hpp:200
genesis::population::SampleCounts::d_count
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: sample_counts.hpp:91
genesis::population::SampleCountsFilterNumericalParams::only_snps
bool only_snps
Filter if the sample does not have two or more alleles.
Definition: sample_counts_filter_numerical.hpp:189
genesis::population::SampleCountsFilterNumericalParams::max_read_depth
size_t max_read_depth
Maximum read depth expected for a SampleCounts to be considered covered.
Definition: sample_counts_filter_numerical.hpp:174
genesis::population::SampleCountsFilterNumericalParams::min_read_depth
size_t min_read_depth
Minimum read depth expected for a SampleCounts to be considered covered.
Definition: sample_counts_filter_numerical.hpp:163
genesis::population::SampleCountsFilterTag::kNotBiallelicSnp
@ kNotBiallelicSnp
SNP position, but not biallelic, i.e., has more than one alternative.
genesis::population::Variant::status
FilterStatus status
Definition: variant.hpp:76
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76