A library for working with phylogenetic and population genetic data.
v0.27.0
variant_input_iterator.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cstdint>
41 #include <memory>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Local Helpers
49 // =================================================================================================
50 
61 template<class T, class R>
63  std::string const& filename,
64  R const& reader,
65  std::vector<size_t> const& sample_indices,
66  bool inverse_sample_indices,
67  std::vector<bool> const& sample_filter
68 ) {
69  // Prepare a reader. We switch depending on the type of filter.
70  // Not both can be given by the way that this function is called; assert that.
71  assert( sample_indices.empty() || sample_filter.empty() );
72 
73  std::shared_ptr<T> input;
74  if( ! sample_indices.empty() ) {
75 
76  // When we have indices given, we need to open the file once to get the number of samples
77  // in the file, then create our correctly sized bool vector, and then open the file again
78  // to start iterating with the filter. Cumbersome, but an unfortunate detail of the
79  // current implementation. Might need fixing later.
80  input = std::make_shared<T>( utils::from_file( filename ), reader );
81  auto const smp_cnt = (*input)->samples.size();
82 
83  // Make the filter. We check the condition that the make function checks here as well,
84  // as the error message is super not helpful for users otherwise. See there for details.
85  auto max_it = std::max_element( sample_indices.begin(), sample_indices.end() );
86  if( *max_it + 1 > smp_cnt ) {
87  throw std::invalid_argument(
88  "In " + filename + ": "
89  "Cannot create sample filter for the input file, as the filter index list contains "
90  "entries for " + std::to_string( *max_it + 1 ) + " samples, "
91  "while the input file only contains " + std::to_string( smp_cnt ) + " samples."
92  );
93  }
94 
95  // Now make a bool filter, inverse as needed, and restart the file with it.
96  auto smp_flt = utils::make_bool_vector_from_indices( sample_indices, smp_cnt );
97  if( inverse_sample_indices ) {
98  smp_flt.flip();
99  }
100  input = std::make_shared<T>(
101  utils::from_file( filename ),
102  smp_flt,
103  reader
104  );
105 
106  } else if( ! sample_filter.empty() ) {
107  input = std::make_shared<T>(
108  utils::from_file( filename ),
109  sample_filter,
110  reader
111  );
112  } else {
113  input = std::make_shared<T>(
114  utils::from_file( filename ),
115  reader
116  );
117  }
118  assert( input );
119  return input;
120 }
121 
122 // =================================================================================================
123 // SAM/BAM/CRAM
124 // =================================================================================================
125 
126 // Only available if compiled with htslib
127 #ifdef GENESIS_HTSLIB
128 
130  std::string const& filename,
131  SamVariantInputIterator const& reader
132 ) {
133  // Make an iterator over sam/bam/cram, using the given reader to take over its settings.
134  // We wrap this in a shared pointer so that this very instance can stay alive
135  // when being copied over to the lambda that we return from this function.
136  auto input = std::make_shared<SamVariantInputIterator>( reader );
137  input->input_file( filename );
138 
139  // Get the iterators. We store them by copy in the lambda, and these copies are kept alive
140  // when returning from this function.
141  assert( input );
142  auto cur = input->begin();
143  auto end = input->end();
144 
145  // Get the data, using the file base name without path and potential extensions as source.
147  data.file_path = filename;
148  data.source_name = utils::file_basename( filename, { ".sam", ".sam.gz", ".bam", ".cram" });
149 
150  // Get the sample names from the read group tags. If they are empty, because the reader
151  // was not set up to split by read group tags, we instead use an empty name, to indicate that
152  // there is one unnamed sample.
153  data.sample_names = cur.rg_tags();
154  if( data.sample_names.empty() ) {
155  // We could have an input file where we want to split by RG, but no RG are set in the
156  // header. When not using unaccounted RG, we would end up with no samples.
157  // Take this into account, and create as many empty (unnamed) samples as needed.
158  // This cannot be more than one though, as it can be the unaccounted or none,
159  // or, if we do not split by RG at all, just the one sample were every read ends up in.
160  for( size_t i = 0; i < cur.sample_size(); ++i ) {
161  data.sample_names.push_back( "" );
162  }
163  assert( data.sample_names.size() <= 1 );
164  } else {
165  assert( reader.split_by_rg() == true );
166  }
167 
168  // The input is copied over to the lambda, and that copy is kept alive
169  // when returning from this function.
170  return VariantInputIterator(
171  [ input, cur, end ]( Variant& variant ) mutable {
172  if( cur != end ) {
173  variant = std::move( *cur );
174  ++cur;
175  return true;
176  } else {
177  return false;
178  }
179  },
180  std::move( data )
181  );
182 }
183 
184 #endif // GENESIS_HTSLIB
185 
186 // =================================================================================================
187 // Pileup
188 // =================================================================================================
189 
194  std::string const& filename,
195  SimplePileupReader const& reader,
196  std::vector<size_t> const& sample_indices,
197  bool inverse_sample_indices,
198  std::vector<bool> const& sample_filter
199 ) {
200  // Get the input, taking care of the filters.
203  >(
204  filename, reader, sample_indices, inverse_sample_indices, sample_filter
205  );
206 
207  // Get the data, using the file base name without path and potential extensions as source.
209  data.file_path = filename;
210  data.source_name = utils::file_basename(
211  filename, { ".gz", ".plp", ".mplp", ".pileup", ".mpileup" }
212  );
213 
214  // No sample names in pileup...
215  // so we just fill with empty names to indicate the number of samples.
216  for( size_t i = 0; i < (*input)->samples.size(); ++i ) {
217  data.sample_names.push_back( "" );
218  }
219 
220  // The input is copied over to the lambda, and that copy is kept alive
221  // when returning from this function.
222  return VariantInputIterator(
223  [ input ]( Variant& variant ) mutable -> bool {
224  auto& it = *input;
225  if( it ) {
226  variant = std::move( *it );
227  ++it;
228  return true;
229  } else {
230  return false;
231  }
232  },
233  std::move( data )
234  );
235 }
236 
238  std::string const& filename,
239  SimplePileupReader const& reader
240 ) {
242  filename, reader, std::vector<size_t>{}, false, std::vector<bool>{}
243  );
244 }
245 
247  std::string const& filename,
248  std::vector<size_t> const& sample_indices,
249  bool inverse_sample_indices,
250  SimplePileupReader const& reader
251 ) {
253  filename, reader, sample_indices, inverse_sample_indices, std::vector<bool>{}
254  );
255 }
256 
258  std::string const& filename,
259  std::vector<bool> const& sample_filter,
260  SimplePileupReader const& reader
261 ) {
263  filename, reader, std::vector<size_t>{}, false, sample_filter
264  );
265 }
266 
267 // =================================================================================================
268 // Sync
269 // =================================================================================================
270 
272  std::string const& filename,
273  std::vector<size_t> const& sample_indices,
274  bool inverse_sample_indices,
275  std::vector<bool> const& sample_filter
276 ) {
277  // Get the input, taking care of the filters. We use a default reader here,
278  // as sync currently does not have any settings that a reader would neeed to take care of.
281  >(
282  filename, SyncReader(), sample_indices, inverse_sample_indices, sample_filter
283  );
284  // auto input = std::make_shared<SyncInputIterator>( utils::from_file( filename ));
285 
286  // Get the data, using the file base name without path and potential extensions as source.
288  data.file_path = filename;
289  data.source_name = utils::file_basename( filename, { ".gz", ".sync" });
290 
291  // No sample names in sync...
292  // so we just fill with empty names to indicate the number of samples.
293  for( size_t i = 0; i < (*input)->samples.size(); ++i ) {
294  data.sample_names.push_back( "" );
295  }
296 
297  // The input is copied over to the lambda, and that copy is kept alive
298  // when returning from this function.
299  return VariantInputIterator(
300  [ input ]( Variant& variant ) mutable {
301  auto& sync_it = *input;
302  if( sync_it ) {
303  variant = std::move( *sync_it );
304  ++sync_it;
305  return true;
306  } else {
307  return false;
308  }
309  },
310  std::move( data )
311  );
312 }
313 
315  std::string const& filename
316 ) {
318  filename, std::vector<size_t>{}, false, std::vector<bool>{}
319  );
320 }
321 
323  std::string const& filename,
324  std::vector<size_t> const& sample_indices,
325  bool inverse_sample_indices
326 ) {
328  filename, sample_indices, inverse_sample_indices, std::vector<bool>{}
329  );
330 }
331 
333  std::string const& filename,
334  std::vector<bool> const& sample_filter
335 ) {
337  filename, std::vector<size_t>{}, false, sample_filter
338  );
339 }
340 
341 // =================================================================================================
342 // VCF
343 // =================================================================================================
344 
345 // Only available if compiled with htslib
346 #ifdef GENESIS_HTSLIB
347 
352  // File input
353  std::string const& filename,
354  std::vector<std::string> const& sample_names,
355  bool inverse_sample_names,
356 
357  // Settings
358  bool pool_samples,
359  bool use_allelic_depth,
360  bool only_biallelic,
361  bool only_filter_pass
362 ) {
363  // Make an iterator over vcf, and check that the necessary format field AD is present
364  // and of the correct form. We wrap this in a shared pointer so that this very instance
365  // can stay alive when being copied over to the lambda that we return from this function.
366  auto input = std::make_shared<VcfInputIterator>( filename, sample_names, inverse_sample_names );
367  if(
368  use_allelic_depth &&
369  ! input->header().has_format( "AD", VcfValueType::kInteger, VcfValueSpecial::kReference )
370  ) {
371  throw std::runtime_error(
372  "Cannot iterate over VCF file " + filename + " using the \"AD\" FORMAT " +
373  "field to count allelic depths, as that field is not part of the VCF file."
374  );
375  }
376 
377  // Get the data, using the file base name without path and potential extensions as source.
379  data.file_path = filename;
380  data.source_name = utils::file_basename( filename, { ".gz", ".vcf", ".bcf" });
381  data.sample_names = input->header().get_sample_names();
382 
383  // The input is copied over to the lambda, and that copy is kept alive
384  // when returning from this function.
385  return VariantInputIterator(
386  [ input, pool_samples, use_allelic_depth, only_biallelic, only_filter_pass ]
387  ( Variant& variant ) mutable {
388  auto& vcf_it = *input;
389 
390  // Only use the lines that have the "AD" field, and are biallelic.
391  // Also test for the extra conditions. If any test fails, skip this position.
392  // For simplicity, we code that as a sequence of conditions; the compiler will optimize.
393  for( ; vcf_it; ++vcf_it ) {
394  if( ! vcf_it->has_format( "AD" ) || ! vcf_it->is_snp() ) {
395  continue;
396  }
397  if( only_biallelic && vcf_it->get_alternatives_count() != 1 ) {
398  continue;
399  }
400  if( only_filter_pass && ! vcf_it->pass_filter() ) {
401  continue;
402  }
403  break;
404  }
405 
406  // Now we are either at a record that fits our needs, or at the end of the input.
407  if( vcf_it ) {
408  assert( vcf_it->has_format( "AD" ) );
409  assert( vcf_it->is_snp() );
410 
411  // Depending on what type of conversion we want to do (which in turn depends on
412  // the wrapper function that this local function was called from), we switch
413  // between pools and individuals here.
414  if( pool_samples ) {
415  variant = convert_to_variant_as_pool( *vcf_it );
416  } else {
417  variant = convert_to_variant_as_individuals( *vcf_it, use_allelic_depth );
418  }
419 
420  // Move on to the next input, so that it is ready when this lambda is called again.
421  ++vcf_it;
422  return true;
423  } else {
424  // If we reached the end of the input, return false to signal this.
425  return false;
426  }
427  },
428  std::move( data )
429  );
430 }
431 
433  std::string const& filename,
434  bool only_biallelic,
435  bool only_filter_pass
436 ) {
438  filename, std::vector<std::string>{}, false,
439  only_biallelic, only_filter_pass
440  );
441 }
442 
444  std::string const& filename,
445  std::vector<std::string> const& sample_names,
446  bool inverse_sample_names,
447  bool only_biallelic,
448  bool only_filter_pass
449 ) {
451  filename, sample_names, inverse_sample_names,
452  true, true, only_biallelic, only_filter_pass
453  );
454 }
455 
457  std::string const& filename,
458  bool use_allelic_depth,
459  bool only_biallelic,
460  bool only_filter_pass
461 ) {
463  filename, std::vector<std::string>{}, false,
464  use_allelic_depth, only_biallelic, only_filter_pass
465  );
466 }
467 
469  std::string const& filename,
470  std::vector<std::string> const& sample_names,
471  bool inverse_sample_names,
472  bool use_allelic_depth,
473  bool only_biallelic,
474  bool only_filter_pass
475 ) {
477  filename, sample_names, inverse_sample_names,
478  false, use_allelic_depth, only_biallelic, only_filter_pass
479  );
480 }
481 
482 #endif // GENESIS_HTSLIB
483 
484 // =================================================================================================
485 // Variant Parallel Input Iterator
486 // =================================================================================================
487 
489  VariantParallelInputIterator const& parallel_input,
490  bool allow_ref_base_mismatches,
491  bool allow_alt_base_mismatches,
492  std::string const& source_sample_separator
493 ) {
494  // As before, make a shared pointer (with a copy of the input) that stays alive.
495  auto input = std::make_shared<VariantParallelInputIterator>( parallel_input );
496 
497  // Get the iterators.
498  assert( input );
499  auto cur = input->begin();
500  auto end = input->end();
501 
502  // We do not have a single file here, so let's put the file names into the sample names...
503  // for now at least. Might change the interface in the future to better accommodate for ath.
504  // Leave file_path and source_name at their empty defaults.
506  for( auto const& source : input->inputs() ) {
507  auto const& source_name = source.data().source_name;
508  for( auto const& sample_name : source.data().sample_names ) {
509  data.sample_names.push_back( source_name + source_sample_separator + sample_name );
510  }
511  }
512 
513  // The input is copied over to the lambda, and that copy is kept alive.
514  return VariantInputIterator(
515  [ input, cur, end, allow_ref_base_mismatches, allow_alt_base_mismatches ]
516  ( Variant& variant ) mutable {
517  if( cur != end ) {
518  variant = cur.joined_variant(
519  allow_ref_base_mismatches, allow_alt_base_mismatches, true
520  );
521  ++cur;
522  return true;
523  } else {
524  return false;
525  }
526  },
527  std::move( data )
528  );
529 }
530 
531 } // namespace population
532 } // namespace genesis
genesis::population::SamVariantInputIterator
Input iterator for SAM/BAM/CRAM files that produces a Variant per genome position.
Definition: sam_variant_input_iterator.hpp:102
genesis::utils::LambdaIterator
Type erasure for iterators, using std::function to eliminate the underlying input type.
Definition: lambda_iterator.hpp:150
genesis::population::VariantInputIteratorData::file_path
std::string file_path
Full file path, when reading from a file.
Definition: variant_input_iterator.hpp:85
fs.hpp
Provides functions for accessing the file system.
helper.hpp
genesis::population::SimplePileupReader
Reader for line-by-line assessment of (m)pileup files.
Definition: simple_pileup_reader.hpp:77
genesis::population::VariantParallelInputIterator
Iterate multiple input sources that yield Variants in parallel.
Definition: variant_parallel_input_iterator.hpp:131
genesis::utils::from_file
std::shared_ptr< BaseInputSource > from_file(std::string const &file_name, bool detect_compression=true)
Obtain an input source for reading from a file.
Definition: input_source.hpp:67
genesis::population::make_input_iterator_with_sample_filter_
std::shared_ptr< T > make_input_iterator_with_sample_filter_(std::string const &filename, R const &reader, std::vector< size_t > const &sample_indices, bool inverse_sample_indices, std::vector< bool > const &sample_filter)
Local helper function template that takes care of intilizing an input iterator, and setting the sampl...
Definition: variant_input_iterator.cpp:62
genesis::population::convert_to_variant_as_individuals
Variant convert_to_variant_as_individuals(VcfRecord const &record, bool use_allelic_depth)
Convert a VcfRecord to a Variant, treating each sample as an individual, and combining them all into ...
Definition: vcf_common.cpp:381
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genesis::population::VcfValueSpecial::kReference
@ kReference
genesis::population::make_variant_input_iterator_from_sync_file
VariantInputIterator make_variant_input_iterator_from_sync_file(std::string const &filename)
Create a VariantInputIterator to iterate the contents of a PoPoolation2 sync file as Variants.
Definition: variant_input_iterator.cpp:314
genesis::population::make_variant_input_iterator_from_vcf_file_
VariantInputIterator make_variant_input_iterator_from_vcf_file_(std::string const &filename, std::vector< std::string > const &sample_names, bool inverse_sample_names, bool pool_samples, bool use_allelic_depth, bool only_biallelic, bool only_filter_pass)
Local helper function that takes care of both main functions below.
Definition: variant_input_iterator.cpp:351
genesis::population::VariantInputIteratorData::source_name
std::string source_name
User-readable name of the input source.
Definition: variant_input_iterator.hpp:92
genesis::population::SimplePileupInputIterator
Iterate an input source and parse it as a (m)pileup file.
Definition: simple_pileup_input_iterator.hpp:79
genesis::population::VcfValueType::kInteger
@ kInteger
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::make_variant_input_iterator_from_pileup_file_
VariantInputIterator make_variant_input_iterator_from_pileup_file_(std::string const &filename, SimplePileupReader const &reader, std::vector< size_t > const &sample_indices, bool inverse_sample_indices, std::vector< bool > const &sample_filter)
Local helper function that takes care of the three functions below.
Definition: variant_input_iterator.cpp:193
genesis::population::VariantInputIteratorData::sample_names
std::vector< std::string > sample_names
Sample names, for example as found in the file header.
Definition: variant_input_iterator.hpp:101
variant_input_iterator.hpp
genesis::population::make_variant_input_iterator_from_individual_vcf_file
VariantInputIterator make_variant_input_iterator_from_individual_vcf_file(std::string const &filename, bool use_allelic_depth, bool only_biallelic, bool only_filter_pass)
Create a VariantInputIterator to iterate the contents of a VCF file as Variants, treating each sample...
Definition: variant_input_iterator.cpp:456
genesis::population::SyncInputIterator
Iterate an input source and parse it as a sync file.
Definition: sync_input_iterator.hpp:74
genesis::population::make_variant_input_iterator_from_sam_file
VariantInputIterator make_variant_input_iterator_from_sam_file(std::string const &filename, SamVariantInputIterator const &reader)
Create a VariantInputIterator to iterate the contents of a SAM/BAM/CRAM file as Variants.
Definition: variant_input_iterator.cpp:129
variant_parallel_input_iterator.hpp
genesis::utils::file_basename
std::string file_basename(std::string const &filename)
Remove directory name from file name if present.
Definition: fs.cpp:687
genesis::utils::make_bool_vector_from_indices
std::vector< bool > make_bool_vector_from_indices(std::vector< size_t > const &indices, size_t size)
Helper function to create a bool vector from a set of indices to be set to true.
Definition: utils/math/bitvector/helper.cpp:45
genesis::population::VariantInputIterator
utils::LambdaIterator< Variant, VariantInputIteratorData > VariantInputIterator
Iterate Variants, using a variety of input file formats.
Definition: variant_input_iterator.hpp:124
genesis::population::SyncReader
Reader for PoPoolation2's "synchronized" files.
Definition: sync_reader.hpp:62
genesis::population::make_variant_input_iterator_from_sync_file_
VariantInputIterator make_variant_input_iterator_from_sync_file_(std::string const &filename, std::vector< size_t > const &sample_indices, bool inverse_sample_indices, std::vector< bool > const &sample_filter)
Definition: variant_input_iterator.cpp:271
genesis::population::make_variant_input_iterator_from_pool_vcf_file
VariantInputIterator make_variant_input_iterator_from_pool_vcf_file(std::string const &filename, bool only_biallelic, bool only_filter_pass)
Create a VariantInputIterator to iterate the contents of a VCF file as Variants, treating each sample...
Definition: variant_input_iterator.cpp:432
functions.hpp
genesis::population::convert_to_variant_as_pool
Variant convert_to_variant_as_pool(VcfRecord const &record)
Convert a VcfRecord to a Variant, treating each sample column as a pool of individuals.
Definition: vcf_common.cpp:275
genesis::population::make_variant_input_iterator_from_pileup_file
VariantInputIterator make_variant_input_iterator_from_pileup_file(std::string const &filename, SimplePileupReader const &reader)
Create a VariantInputIterator to iterate the contents of a (m)pileup file as Variants.
Definition: variant_input_iterator.cpp:237
genesis::population::make_variant_input_iterator_from_variant_parallel_input_iterator
VariantInputIterator make_variant_input_iterator_from_variant_parallel_input_iterator(VariantParallelInputIterator const &parallel_input, bool allow_ref_base_mismatches, bool allow_alt_base_mismatches, std::string const &source_sample_separator)
Create a VariantInputIterator to iterate multiple input sources at once, using a VariantParallelInput...
Definition: variant_input_iterator.cpp:488
genesis::population::SamVariantInputIterator::split_by_rg
bool split_by_rg() const
Definition: sam_variant_input_iterator.hpp:634
genesis::population::VariantInputIteratorData
Data storage for input-specific information when traversing a variant file.
Definition: variant_input_iterator.hpp:80