A library for working with phylogenetic and population genetic data.
v0.32.0
variant_input_stream.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 #include <unordered_set>
38 
39 namespace genesis {
40 namespace population {
41 
42 // =================================================================================================
43 // Sample Name Filter
44 // =================================================================================================
45 
46 std::vector<bool> make_sample_name_filter(
47  std::vector<std::string> const& sample_names,
48  std::vector<std::string> const& names_filter,
49  bool inverse_filter
50 ) {
51  // Turn filter into set for fast access. We don't do a simple iterator-based copy here,
52  // as we at the same time also want to check for duplicates.
53  std::unordered_set<std::string> filter_set;
54  for( auto const& name : names_filter ) {
55  if( filter_set.count( name ) != 0 ) {
56  throw std::invalid_argument(
57  "Cannot apply sample name filter, as filter name \"" + name + "\" appears "
58  "multiple times in the list of names used for filtering."
59  );
60  }
61  filter_set.insert( name );
62  }
63  assert( filter_set.size() == names_filter.size() );
64 
65  // Now go through the names, and check if they are to be filtered or not. At the same time,
66  // we also build a set of those names, for duplication check. Whenever we have processed a name
67  // from the filter set, we remove it there, so that if anything remains at the end, we know
68  // that it did not appear in the sample names list.
69  std::unordered_set<std::string> names_set;
70  auto result = std::vector<bool>( sample_names.size(), false );
71  for( size_t i = 0; i < sample_names.size(); ++i ) {
72  auto const& name = sample_names[i];
73 
74  // Duplicate check.
75  if( names_set.count( name ) > 0 ) {
76  throw std::invalid_argument(
77  "Cannot apply sample name filter, as sample name \"" + name + "\" appears "
78  "multiple times in the sample names."
79  );
80  }
81  names_set.insert( name );
82 
83  // Filter, and remove from filter.
84  bool const found = ( filter_set.count( name ) > 0 );
85  result[i] = ( found ^ inverse_filter );
86  if( found ) {
87  filter_set.erase( name );
88  }
89  }
90 
91  // Check if there are any remaining names in the filter list. If so, that's an error.
92  if( filter_set.size() > 0 ) {
93  throw std::invalid_argument(
94  "Cannot apply sample name filter, as the list of names to filter contains names that "
95  "do not appear in the sample names, such as \"" + *filter_set.begin() + "\"."
96  );
97  }
98 
99  // All good, return the filter vector.
100  return result;
101 }
102 
104  std::vector<bool> const& sample_filter
105 ) {
106  // We do a pop count once, and store that result in the lambda, so that we do not need
107  // to recompute this every time that the filter is being used.
108  size_t pop_count = 0;
109  for( size_t i = 0; i < sample_filter.size(); ++i ) {
110  if( sample_filter[i] ) {
111  ++pop_count;
112  }
113  }
114  assert( pop_count <= sample_filter.size() );
115 
116  // We make a copy of the filter, to be sure that it is alive when needed.
117  return [ sample_filter, pop_count ]( Variant& variant ){
118  if( variant.samples.size() != sample_filter.size() ) {
119  throw std::runtime_error(
120  "Invalid sample filter, which filters a list of " +
121  std::to_string( sample_filter.size() ) + " samples, while the Variant has " +
122  std::to_string( variant.samples.size() ) + " samples instead."
123  );
124  }
125 
126  // Allocate a samples vector of the right size, and move samples around.
127  std::vector<SampleCounts> samples;
128  samples.reserve( pop_count );
129  for( size_t i = 0; i < sample_filter.size(); ++i ) {
130  if( sample_filter[i] ) {
131  samples.push_back( std::move( variant.samples[i] ));
132  }
133  }
134  assert( samples.size() == pop_count );
135  variant.samples = std::move( samples );
136  };
137 }
138 
139 // =================================================================================================
140 // Sample Subsetting / Subsampling
141 // =================================================================================================
142 
144  size_t max_depth,
145  SubsamplingMethod method
146 ) {
147  // Simply return a function object that applies the transformation.
148  switch( method ) {
150  return [ max_depth ]( Variant& variant ){
151  subscale_counts( variant, max_depth );
152  };
153  }
155  return [ max_depth ]( Variant& variant ){
156  subsample_counts_with_replacement( variant, max_depth );
157  };
158  }
160  return [ max_depth ]( Variant& variant ){
161  subsample_counts_without_replacement( variant, max_depth );
162  };
163  }
164  }
165 
166  throw std::invalid_argument(
167  "Invalid method provided for make_variant_input_stream_sample_subsampling_transform()"
168  );
169 }
170 
171 // =================================================================================================
172 // Observers
173 // =================================================================================================
174 
176  std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict,
177  bool check_sequence_lengths
178 ) {
179  // We create copies of stuff here using lambda capture.
180  // Those will then live on once the lambda is in use.
181  GenomeLocus current_locus;
182  return [ sequence_dict, check_sequence_lengths, current_locus ]( Variant const& variant ) mutable {
183  // When a dict is provided, we need to take care of the special empty case for the chromosome
184  // name, which will happen in the very first time that this callback is used.
185  // The empty string won't be part of the dict chromosomes. For simplicity,
186  // we here always check that case. We later check that the variant itself does not have
187  // an empty chromosome name, in order to not regress to this starting condition.
188  if( ! current_locus.empty() ) {
189 
190  // If a dict is provided, but the chromosome is not in there, the following check throws.
191  if( ! locus_greater( variant.chromosome, variant.position, current_locus, sequence_dict )) {
192  throw std::runtime_error(
193  "Invalid sorting order of input Variants. By default, we expect lexicographical "
194  "sorting of chromosomes, and then sorting by position within chromosomes. "
195  "Alternatively, when a sequence dictionary is specified (such as from a .dict "
196  "or .fai file, or from a reference genome .fasta file), we expect the order of "
197  "chromosomes as specified there. "
198  "Offending input going from " + to_string( current_locus ) + " to " +
199  to_string( GenomeLocus{ variant.chromosome, variant.position })
200  );
201  }
202  }
203 
204  // Now also check the length, potentially.
205  if( check_sequence_lengths && sequence_dict ) {
206  auto const& entry = sequence_dict->get( variant.chromosome );
207  if( variant.position > entry.length ) {
208  throw std::runtime_error(
209  "The current position " +
210  to_string( GenomeLocus{ variant.chromosome, variant.position }) +
211  " of the input Variant is greater than the length of the chromosome "
212  "as specified by the SequenceDict, which is " + std::to_string( entry.length )
213  );
214  }
215  }
216 
217  // Finally, update the current locus according to the variant.
218  // We also check that the current variant is a valid, so that we do not run into
219  // the initial condition of an empty chromosome again.
220  current_locus = GenomeLocus( variant.chromosome, variant.position );
221  if( current_locus.chromosome.empty() || current_locus.position == 0 ) {
222  throw std::runtime_error(
223  "Invalid empty chromosome or position 0 found in input Variant."
224  );
225  }
226  };
227 }
228 
230  std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict
231 ) {
232  return [ sequence_dict ]( Variant const& variant ) {
233  // Assumes the chromosome is in the dict. If not, the get function throws.
234  auto const& entry = sequence_dict->get( variant.chromosome );
235  if( variant.position > entry.length ) {
236  throw std::runtime_error(
237  "The current position " +
238  to_string( GenomeLocus{ variant.chromosome, variant.position }) +
239  " of the input Variant is greater than the length of the chromosome "
240  "as specified by the SequenceDict, which is " + std::to_string( entry.length )
241  );
242  }
243  };
244 }
245 
246 } // namespace population
247 } // namespace genesis
genesis::population::GenomeLocus::empty
bool empty() const
Definition: genome_locus.hpp:84
subsample.hpp
genesis::population::make_variant_input_stream_sequence_order_observer
std::function< void(Variant const &)> make_variant_input_stream_sequence_order_observer(std::shared_ptr< genesis::sequence::SequenceDict > sequence_dict={}, bool check_sequence_lengths=true)
Helper function to check that some Variant input is sorted properly.
Definition: function/variant_input_stream.hpp:202
genesis::population::SubsamplingMethod
SubsamplingMethod
Select which method to use for reducing the max read depth of a SampleCounts sample or a Variant.
Definition: function/variant_input_stream.hpp:99
genesis::population::GenomeLocus::position
size_t position
Definition: genome_locus.hpp:67
variant_input_stream.hpp
genesis::population::locus_greater
bool locus_greater(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Greater than comparison (>) for two loci in a genome.
Definition: function/genome_locus.hpp:465
genesis::population::GenomeLocus
A single locus, that is, a position (or coordinate) on a chromosome.
Definition: genome_locus.hpp:64
genesis::population::SubsamplingMethod::kSubsampleWithReplacement
@ kSubsampleWithReplacement
Use transform_subsample_with_replacement()
genesis::population::GenomeLocus::chromosome
std::string chromosome
Definition: genome_locus.hpp:66
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::population::make_variant_input_stream_sample_name_filter_transform
std::function< void(Variant &)> make_variant_input_stream_sample_name_filter_transform(std::vector< bool > const &sample_filter)
Helper function to create a Variant transform to filter out samples.
Definition: function/variant_input_stream.hpp:85
genesis::population::SubsamplingMethod::kSubsampleWithoutReplacement
@ kSubsampleWithoutReplacement
Use transform_subsample_without_replacement()
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis::population::make_variant_input_stream_sample_subsampling_transform
std::function< void(Variant &)> make_variant_input_stream_sample_subsampling_transform(size_t max_depth, SubsamplingMethod method=SubsamplingMethod::kSubscale)
Create a Variant transformation function that subscales or subsamples the base counts to be below a g...
Definition: function/variant_input_stream.hpp:134
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::subsample_counts_with_replacement
void subsample_counts_with_replacement(SampleCounts &sample, size_t max_depth)
Transform a SampleCounts sample by subsampling the nucleotide counts (A, C, G, T, as well as N and D)...
Definition: subsample.cpp:228
genesis::population::subsample_counts_without_replacement
void subsample_counts_without_replacement(SampleCounts &sample, size_t max_depth)
Transform a SampleCounts sample by subsampling the nucleotide counts (A, C, G, T, as well as N and D)...
Definition: subsample.cpp:268
genesis::population::subscale_counts
void subscale_counts(SampleCounts &sample, size_t max_depth)
Transform a SampleCounts sample by sub-scaling the base counts (A, C, G, T, as well as N and D) to su...
Definition: subsample.cpp:149
genesis::sequence::SequenceDict::get
Entry const & get(std::string const &name) const
Definition: sequence_dict.hpp:233
genesis::population::make_variant_input_stream_sequence_length_observer
std::function< void(Variant const &)> make_variant_input_stream_sequence_length_observer(std::shared_ptr< genesis::sequence::SequenceDict > sequence_dict)
Helper function to check that some Variant input has positions that agree with those reported in a Se...
Definition: function/variant_input_stream.hpp:217
genesis::population::SubsamplingMethod::kSubscale
@ kSubscale
Use transform_subscale()
genesis::population::make_sample_name_filter
std::vector< bool > make_sample_name_filter(std::vector< std::string > const &sample_names, std::vector< std::string > const &names_filter, bool inverse_filter)
Create a filter for samples, indicating which to keep.
Definition: variant_input_stream.cpp:46