A library for working with phylogenetic and population genetic data.
v0.27.0
filter_transform.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTIONS_FILTER_TRANSFORM_H_
2 #define GENESIS_POPULATION_FUNCTIONS_FILTER_TRANSFORM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
39 
40 #include <functional>
41 #include <memory>
42 #include <stdexcept>
43 #include <string>
44 #include <utility>
45 #include <vector>
46 
47 namespace genesis {
48 namespace population {
49 
50 // =================================================================================================
51 // Filter Helpers
52 // =================================================================================================
53 
/**
 * @brief Select how Variant filter functions that evaluate properties of the
 * Variant::samples (BaseCounts) objects treat the individual samples of a Variant.
 */
enum class SampleFilterType
{
    /**
     * @brief The filter returns `true` only if all of the BaseCounts samples in the Variant
     * return `true` for a given predicate.
     */
    kConjunction,

    /**
     * @brief The filter returns `true` if any of the BaseCounts samples in the Variant
     * return `true` for a given predicate.
     */
    kDisjunction,

    /**
     * @brief The filter is applied to the merged BaseCounts of all samples in the Variant.
     */
    kMerge
};
80 
81 // =================================================================================================
82 // Filter by Status
83 // =================================================================================================
84 
/**
 * @brief Filter a Variant based on a predicate that is applied to the result of a status()
 * call on the BaseCounts of the Variant.
 *
 * Declaration only; the definition is in filter_transform.cpp.
 *
 * @param predicate           Callable that receives a BaseCountsStatus and returns whether
 *                            that status passes the filter.
 * @param variant             The Variant to test.
 * @param type                How the samples of the Variant are treated, see SampleFilterType.
 * @param min_coverage        NOTE(review): presumably forwarded to the status() computation
 *                            together with max_coverage, min_count and tolerate_deletions;
 *                            confirm against the definition.
 * @param max_coverage        See min_coverage.
 * @param min_count           See min_coverage.
 * @param tolerate_deletions  See min_coverage.
 * @return The outcome of the filter for the given Variant.
 */
103 bool filter_by_status(
104  std::function<bool(BaseCountsStatus const&)> predicate,
105  Variant const& variant,
106  SampleFilterType type,
107  size_t min_coverage = 0,
108  size_t max_coverage = 0,
109  size_t min_count = 0,
110  bool tolerate_deletions = false
111 );
112 
123 inline std::function<bool(Variant const&)> filter_by_status(
124  std::function<bool(BaseCountsStatus const&)> predicate,
125  SampleFilterType type,
126  size_t min_coverage = 0,
127  size_t max_coverage = 0,
128  size_t min_count = 0,
129  bool tolerate_deletions = false
130 ) {
131  return [=]( Variant const& variant ){
132  return filter_by_status(
133  predicate, variant, type, min_coverage, max_coverage, min_count, tolerate_deletions
134  );
135  };
136 }
137 
152 inline bool filter_is_snp(
153  Variant const& variant,
154  SampleFilterType type,
155  size_t min_coverage = 0,
156  size_t max_coverage = 0,
157  size_t min_count = 0,
158  bool tolerate_deletions = false
159 ) {
160  return filter_by_status(
161  []( BaseCountsStatus const& stat ){
162  return stat.is_covered && stat.is_snp;
163  },
164  variant, type, min_coverage, max_coverage, min_count, tolerate_deletions
165  );
166 }
167 
178 inline std::function<bool(Variant const&)> filter_is_snp(
179  SampleFilterType type,
180  size_t min_coverage = 0,
181  size_t max_coverage = 0,
182  size_t min_count = 0,
183  bool tolerate_deletions = false
184 ) {
185  return [=]( Variant const& variant ){
186  return filter_is_snp(
187  variant, type, min_coverage, max_coverage, min_count, tolerate_deletions
188  );
189  };
190 }
191 
205  Variant const& variant,
206  SampleFilterType type,
207  size_t min_coverage = 0,
208  size_t max_coverage = 0,
209  size_t min_count = 0,
210  bool tolerate_deletions = false
211 ) {
212  return filter_by_status(
213  []( BaseCountsStatus const& stat ){
214  return stat.is_covered && stat.is_snp && stat.is_biallelic;
215  },
216  variant, type, min_coverage, max_coverage, min_count, tolerate_deletions
217  );
218 }
219 
230 inline std::function<bool(Variant const&)> filter_is_biallelic_snp(
231  SampleFilterType type,
232  size_t min_coverage = 0,
233  size_t max_coverage = 0,
234  size_t min_count = 0,
235  bool tolerate_deletions = false
236 ) {
237  return [=]( Variant const& variant ){
239  variant, type, min_coverage, max_coverage, min_count, tolerate_deletions
240  );
241  };
242 }
243 
244 // =================================================================================================
245 // Filter by Count
246 // =================================================================================================
247 
248 // TODO
249 
250 // variant_filter_min_coverage
251 // variant_filter_max_coverage
252 // variant_filter_min_max_coverage
253 //
254 // variant_filter_min_frequency
255 // variant_filter_max_frequency
256 // variant_filter_min_max_frequency
257 
258 // bool filter_by_min_maf_count( BaseCounts const& sample, size_t min_count );
259 //
260 // bool filter_by_min_maf_count( Variant const& variant, size_t min_count, SampleFilterType type );
261 //
262 // bool filter_by_max_count( BaseCounts const& sample, size_t max_count );
263 //
264 // bool filter_by_max_count( Variant const& variant, size_t max_count, SampleFilterType type );
265 
266 // =================================================================================================
267 // Filter by Region
268 // =================================================================================================
269 
277 inline std::function<bool(Variant const&)> filter_by_region(
278  GenomeRegion const& region,
279  bool complement = false
280 ) {
281  return [region, complement]( Variant const& variant ){
282  return complement ^ is_covered( region, variant );
283  };
284 }
285 
293 inline std::function<bool(Variant const&)> filter_by_region(
294  std::shared_ptr<GenomeRegionList> regions,
295  bool complement = false
296 ) {
297  if( ! regions ) {
298  throw std::invalid_argument(
299  "Can only used filter_by_region() with a valid shared pointer to a GenomeRegionList."
300  );
301  }
302  return [regions, complement]( Variant const& variant ){
303  return complement ^ is_covered( *regions, variant );
304  };
305 }
306 
316 inline std::function<bool(Variant const&)> filter_by_region(
317  GenomeRegionList const& regions,
318  bool complement = false,
319  bool copy_regions = false
320 ) {
321  if( copy_regions ) {
322  return [regions, complement]( Variant const& variant ){
323  return complement ^ is_covered( regions, variant );
324  };
325  } else {
326  return [&regions, complement]( Variant const& variant ){
327  return complement ^ is_covered( regions, variant );
328  };
329  }
330 }
331 
332 // =================================================================================================
333 // Transformations
334 // =================================================================================================
335 
336 // basically, all of the above filters, but as transforms that set stuff to 0 instead of filtering
337 //
338 // inline std::function<void(Variant&)> variant_transform_min_counts( size_t min_count )
339 // {
340 // return [min_count]( Variant& variant ){
341 // for( auto& sample : variant.samples ) {
342 // transform_min_count( sample, min_count );
343 // --> add this function for variants as well first, and use this
344 // (basically just a loop over the other one)
345 // -->> also make this for max and min max, and use these.
346 //
347 // --->> then, these already have the function signature that is needed for the iterator~
348 // no need to do a lambda that just calls it!
349 // --> ah no, because we need to capture the min count setting....
350 // }
351 // };
352 // }
353 //
354 // min count to 0
355 // max count to 0
356 // min max count to 0
357 
358 // =================================================================================================
359 // Transform by Count
360 // =================================================================================================
361 
/**
 * @brief Transform a BaseCounts sample by setting any nucleotide count (A, C, G, T) to zero
 * if @p min_count is not reached for that count.
 *
 * Declaration only; the definition is in filter_transform.cpp.
 * NOTE(review): the exact comparison at the boundary is determined by the definition; confirm there.
 *
 * @param sample     The sample whose counts are modified in place.
 * @param min_count  The minimum count threshold.
 */
370 void transform_zero_out_by_min_count( BaseCounts& sample, size_t min_count );
371 
/**
 * @brief Overload of transform_zero_out_by_min_count() that takes a whole Variant.
 *
 * Declaration only.
 * NOTE(review): presumably applies the BaseCounts overload to every sample of the Variant;
 * confirm against the definition.
 *
 * @param variant    The Variant that is modified in place.
 * @param min_count  The minimum count threshold.
 */
376 void transform_zero_out_by_min_count( Variant& variant, size_t min_count );
377 
/**
 * @brief Transform a BaseCounts sample by setting any nucleotide count (A, C, G, T) to zero
 * if @p max_count is exceeded for that count.
 *
 * Declaration only; the definition is in filter_transform.cpp.
 * NOTE(review): the exact comparison at the boundary is determined by the definition; confirm there.
 *
 * @param sample     The sample whose counts are modified in place.
 * @param max_count  The maximum count threshold.
 */
386 void transform_zero_out_by_max_count( BaseCounts& sample, size_t max_count );
387 
/**
 * @brief Overload of transform_zero_out_by_max_count() that takes a whole Variant.
 *
 * Declaration only.
 * NOTE(review): presumably applies the BaseCounts overload to every sample of the Variant;
 * confirm against the definition.
 *
 * @param variant    The Variant that is modified in place.
 * @param max_count  The maximum count threshold.
 */
392 void transform_zero_out_by_max_count( Variant& variant, size_t max_count );
393 
/**
 * @brief Transform a BaseCounts sample by setting any nucleotide count (A, C, G, T) to zero
 * based on both a minimum and a maximum count threshold.
 *
 * Declaration only; the definition is in filter_transform.cpp.
 * NOTE(review): presumably combines the conditions of transform_zero_out_by_min_count() and
 * transform_zero_out_by_max_count(); confirm against the definition.
 *
 * @param sample     The sample whose counts are modified in place.
 * @param min_count  The minimum count threshold.
 * @param max_count  The maximum count threshold.
 */
401 void transform_zero_out_by_min_max_count( BaseCounts& sample, size_t min_count, size_t max_count );
402 
/**
 * @brief Overload of transform_zero_out_by_min_max_count() that takes a whole Variant.
 *
 * Declaration only.
 * NOTE(review): presumably applies the BaseCounts overload to every sample of the Variant;
 * confirm against the definition.
 *
 * @param variant    The Variant that is modified in place.
 * @param min_count  The minimum count threshold.
 * @param max_count  The maximum count threshold.
 */
407 void transform_zero_out_by_min_max_count( Variant& variant, size_t min_count, size_t max_count );
408 
409 } // namespace population
410 } // namespace genesis
411 
412 #endif // include guard
base_counts.hpp
genesis::population::BaseCountsStatus
Definition: population/functions/functions.hpp:50
genesis::population::SampleFilterType::kConjunction
@ kConjunction
The filter returns true only if all of the BaseCounts samples in the Variant return true for a given ...
genesis::population::BaseCountsStatus::is_biallelic
bool is_biallelic
Is the Sample biallelic?
Definition: population/functions/functions.hpp:89
genesis::population::SampleFilterType
SampleFilterType
Select how Variant filter functions that evaluate properties of the Variant::samples (BaseCounts) obj...
Definition: filter_transform.hpp:58
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:82
genesis::population::is_covered
bool is_covered(GenomeRegion const &region, std::string const &chromosome, size_t position)
Test whether the chromosome/position is within a given genomic region.
Definition: genome_region.cpp:190
genesis::population::SampleFilterType::kMerge
@ kMerge
The filter is applied to the merged BaseCounts of all samples in the Variant.
genesis::population::filter_by_status
bool filter_by_status(std::function< bool(BaseCountsStatus const &)> predicate, Variant const &variant, SampleFilterType type, size_t min_coverage, size_t max_coverage, size_t min_count, bool tolerate_deletions)
Filter a Variant based on a predicate that is applied to the result of a status() call on the BaseCou...
Definition: filter_transform.cpp:43
genome_region.hpp
genesis::population::BaseCountsStatus::is_snp
bool is_snp
Does the Sample have two or more alleles?
Definition: population/functions/functions.hpp:77
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GenomeRegion
A region (between two positions) on a chromosome.
Definition: genome_region.hpp:60
genesis::population::filter_is_snp
bool filter_is_snp(Variant const &variant, SampleFilterType type, size_t min_coverage=0, size_t max_coverage=0, size_t min_count=0, bool tolerate_deletions=false)
Filter a Variant based on whether the sample counts are SNPs, that is, more than one count in [ACGT] ...
Definition: filter_transform.hpp:152
genesis::population::transform_zero_out_by_min_max_count
void transform_zero_out_by_min_max_count(BaseCounts &sample, size_t min_count, size_t max_count)
Transform a BaseCounts sample by setting any nucleotide count (A, C, G, T) to zero if min_count is no...
Definition: filter_transform.cpp:129
genesis::population::transform_zero_out_by_min_count
void transform_zero_out_by_min_count(BaseCounts &sample, size_t min_count)
Transform a BaseCounts sample by setting any nucleotide count (A, C, G, T) to zero if min_count is no...
Definition: filter_transform.cpp:77
genesis::population::SampleFilterType::kDisjunction
@ kDisjunction
The filter returns true if any of the BaseCounts samples in the Variant return true for a given predi...
genesis::population::transform_zero_out_by_max_count
void transform_zero_out_by_max_count(BaseCounts &sample, size_t max_count)
Transform a BaseCounts sample by setting any nucleotide count (A, C, G, T) to zero if max_count is ex...
Definition: filter_transform.cpp:101
variant.hpp
genome_region.hpp
genesis::population::filter_is_biallelic_snp
bool filter_is_biallelic_snp(Variant const &variant, SampleFilterType type, size_t min_coverage=0, size_t max_coverage=0, size_t min_count=0, bool tolerate_deletions=false)
Filter a Variant based on whether the sample counts are biallelic SNPs, that is, exactly two base cou...
Definition: filter_transform.hpp:204
functions.hpp
genesis::population::filter_by_region
std::function< bool(Variant const &)> filter_by_region(GenomeRegion const &region, bool complement=false)
Filter function to be used with VariantInputIterator to filter by a genome region.
Definition: filter_transform.hpp:277
genesis::population::BaseCountsStatus::is_covered
bool is_covered
Is the Sample covered by enough reads/nucleotides?
Definition: population/functions/functions.hpp:65