A library for working with phylogenetic and population genetic data.
v0.27.0
sam_variant_input_iterator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_SAM_VARIANT_INPUT_ITERATOR_H_
2 #define GENESIS_POPULATION_FORMATS_SAM_VARIANT_INPUT_ITERATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
42 
43 #include <cstdint>
44 #include <memory>
45 #include <string>
46 #include <unordered_map>
47 #include <unordered_set>
48 #include <vector>
49 
50 // Forward declarations of htslib structs, so that we do not have to include their headers
51 // and completely spam our namespace here.
52 extern "C" {
53 
54  struct htsFile;
55  struct hts_itr_t;
56  struct sam_hdr_t;
57  struct bam1_t;
58 
59  struct bam_plp_s;
60  typedef struct bam_plp_s *bam_plp_t;
61 
62  struct bam_pileup1_t;
63  // typedef struct bam_pileup1_t *bam_pileup1_t;
64 
65 }
66 
67 namespace genesis {
68 namespace population {
69 
70 // =================================================================================================
71 // SAM/BAM/CRAM File Input iterator
72 // =================================================================================================
73 
103 {
104 public:
105 
106  // -------------------------------------------------------------------------
107  // Member Types
108  // -------------------------------------------------------------------------
109 
112  using pointer = value_type const*;
113  using reference = value_type const&;
114  using difference_type = std::ptrdiff_t;
115  using iterator_category = std::input_iterator_tag;
116 
117  // ======================================================================================
118  // Sam File Handle
119  // ======================================================================================
120 
121 private:
122 
141  struct SamFileHandle;
142 
143  // ======================================================================================
144  // Internal Iterator
145  // ======================================================================================
146 
147 public:
148 
156  class Iterator
157  {
158  public:
159 
160  // -------------------------------------------------------------------------
161  // Constructors and Rule of Five
162  // -------------------------------------------------------------------------
163 
166  using pointer = value_type const*;
167  using reference = value_type const&;
168  using iterator_category = std::input_iterator_tag;
169 
170  private:
171 
172  Iterator() = default;
173  Iterator( SamVariantInputIterator const* parent );
174 
175  public:
176 
177  ~Iterator() = default;
178 
179  Iterator( self_type const& ) = default;
180  Iterator( self_type&& ) = default;
181 
182  Iterator& operator= ( self_type const& ) = default;
183  Iterator& operator= ( self_type&& ) = default;
184 
187 
188  // -------------------------------------------------------------------------
189  // Accessors
190  // -------------------------------------------------------------------------
191 
192  value_type const * operator->() const
193  {
194  return &current_variant_;
195  }
196 
198  {
199  return &current_variant_;
200  }
201 
202  value_type const & operator*() const
203  {
204  return current_variant_;
205  }
206 
208  {
209  return current_variant_;
210  }
211 
212  // -------------------------------------------------------------------------
213  // Iteration
214  // -------------------------------------------------------------------------
215 
217  {
218  increment_();
219  return *this;
220  }
221 
222  // self_type operator ++(int)
223  // {
224  // auto cpy = *this;
225  // increment_();
226  // return cpy;
227  // }
228 
236  bool operator==( self_type const& it ) const
237  {
238  return parent_ == it.parent_;
239  }
240 
241  bool operator!=( self_type const& it ) const
242  {
243  return !(*this == it);
244  }
245 
246  // -------------------------------------------------------------------------
247  // Data Access
248  // -------------------------------------------------------------------------
249 
282  std::vector<std::string> rg_tags( bool all_header_tags = false ) const;
283 
288  size_t sample_size() const;
289 
290  // -------------------------------------------------------------------------
291  // Internal Members
292  // -------------------------------------------------------------------------
293 
294  private:
295 
299  void increment_();
300 
308  void process_base_( bam_pileup1_t const* p );
309 
317  size_t get_sample_index_( bam_pileup1_t const* p ) const;
318 
319  private:
320 
321  // Parent.
322  SamVariantInputIterator const* parent_ = nullptr;
323 
324  // htslib specific file handling pointers during iteration.
325  // We put this in a shared ptr so that the iterator can be copied,
326  // but the htslib data gets only freed once all instances are done.
327  // This also allows PIMPL to avoid including the htslib headers here.
328  std::shared_ptr<SamFileHandle> handle_;
329 
330  // Current variant object, keeping the base tally of the current locus.
331  Variant current_variant_;
332 
333  };
334 
335  // ======================================================================================
336  // Main Class
337  // ======================================================================================
338 
339  // -------------------------------------------------------------------------
340  // Constructors and Rule of Five
341  // -------------------------------------------------------------------------
342 
347  : SamVariantInputIterator( std::string{} )
348  {}
349 
351  std::string const& input_file
352  )
353  : SamVariantInputIterator( input_file, std::unordered_set<std::string>{}, false )
354  {}
355 
357  std::string const& input_file,
358  std::unordered_set<std::string> const& rg_tag_filter,
359  bool inverse_rg_tag_filter = false
360  );
361 
362  ~SamVariantInputIterator() = default;
363 
366 
369 
370  // -------------------------------------------------------------------------
371  // Iteration
372  // -------------------------------------------------------------------------
373 
374  Iterator begin() const
375  {
376  // Better error messaging that what htslib would give us if the file did not exist.
377  // We check here, as input_file() allows to change the file after construction,
378  // so we only do the check once we know that we are good to go.
379  if( ! utils::is_file( input_file_ )) {
380  throw std::runtime_error( "Input sam/bam/cram file not found: " + input_file_ );
381  }
382 
383  return Iterator( this );
384  }
385 
386  Iterator end() const
387  {
388  return Iterator();
389  }
390 
391  // -------------------------------------------------------------------------
392  // Basic Input Settings
393  // -------------------------------------------------------------------------
394 
395  std::string const& input_file() const
396  {
397  return input_file_;
398  }
399 
406  self_type& input_file( std::string const& value )
407  {
408  input_file_ = value;
409  return *this;
410  }
411 
412  // -------------------------------------------------------------------------
413  // Flag Settings
414  // -------------------------------------------------------------------------
415 
416  uint32_t flags_include_all() const
417  {
418  return flags_include_all_;
419  }
420 
439  self_type& flags_include_all( uint32_t value )
440  {
441  flags_include_all_ = value;
442  return *this;
443  }
444 
445  uint32_t flags_include_any() const
446  {
447  return flags_include_any_;
448  }
449 
468  self_type& flags_include_any( uint32_t value )
469  {
470  flags_include_any_ = value;
471  return *this;
472  }
473 
474  uint32_t flags_exclude_all() const
475  {
476  return flags_exclude_all_;
477  }
478 
497  self_type& flags_exclude_all( uint32_t value )
498  {
499 
500  flags_exclude_all_ = value;
501  return *this;
502  }
503 
504  uint32_t flags_exclude_any() const
505  {
506  return flags_exclude_any_;
507  }
508 
527  self_type& flags_exclude_any( uint32_t value )
528  {
529  flags_exclude_any_ = value;
530  return *this;
531  }
532 
533  // -------------------------------------------------------------------------
534  // Quality Settings
535  // -------------------------------------------------------------------------
536 
537  int min_map_qual() const
538  {
539  return min_map_qual_;
540  }
541 
549  self_type& min_map_qual( int value )
550  {
551  min_map_qual_ = value;
552  return *this;
553  }
554 
555  int min_base_qual() const
556  {
557  return min_base_qual_;
558  }
559 
566  self_type& min_base_qual( int value )
567  {
568  min_base_qual_ = value;
569  return *this;
570  }
571 
572  // -------------------------------------------------------------------------
573  // Depth Settings
574  // -------------------------------------------------------------------------
575 
576  int min_depth() const
577  {
578  return min_depth_;
579  }
580 
586  self_type& min_depth( int value )
587  {
588  min_depth_ = value;
589  return *this;
590  }
591 
592  int max_depth() const
593  {
594  return max_depth_;
595  }
596 
603  self_type& max_depth( int value )
604  {
605  max_depth_ = value;
606  return *this;
607  }
608 
610  {
611  return max_acc_depth_;
612  }
613 
625  {
626  max_acc_depth_ = value;
627  return *this;
628  }
629 
630  // -------------------------------------------------------------------------
631  // Read Group Settings
632  // -------------------------------------------------------------------------
633 
634  bool split_by_rg() const
635  {
636  return split_by_rg_;
637  }
638 
647  self_type& split_by_rg( bool value )
648  {
649  split_by_rg_ = value;
650  return *this;
651  }
652 
653  bool with_unaccounted_rg() const
654  {
655  return with_unaccounted_rg_;
656  }
657 
672  {
673  with_unaccounted_rg_ = value;
674  return *this;
675  }
676 
677  std::unordered_set<std::string> const& rg_tag_filter() const
678  {
679  return rg_tag_filter_;
680  }
681 
699  self_type& rg_tag_filter( std::unordered_set<std::string> const& value )
700  {
701  rg_tag_filter_ = value;
702  return *this;
703  }
704 
706  {
707  return inverse_rg_tag_filter_;
708  }
709 
716  {
717  inverse_rg_tag_filter_ = value;
718  return *this;
719  }
720 
721  // bool skip_empty_positions() const
722  // {
723  // return skip_empty_positions_;
724  // }
725  //
726  // self_type& skip_empty_positions( bool value )
727  // {
728  // skip_empty_positions_ = value;
729  // return *this;
730  // }
731 
732  // -------------------------------------------------------------------------
733  // Data Members
734  // -------------------------------------------------------------------------
735 
736 private:
737 
738  // Input data
739  std::string input_file_;
740 
741  // Settings
742 
743  // Read filtering flags, as used by htslib and samtools
744  uint32_t flags_include_all_ = 0;
745  uint32_t flags_include_any_ = 0;
746  uint32_t flags_exclude_all_ = 0;
747  uint32_t flags_exclude_any_ = 0;
748 
749  // Minimum mapping and base qualities
750  int min_map_qual_ = 0;
751  int min_base_qual_ = 0;
752 
753  // Read depth / coverage
754  int min_depth_ = 0;
755  int max_depth_ = 0;
756  int max_acc_depth_ = 0;
757 
758  bool split_by_rg_ = false;
759  bool with_unaccounted_rg_ = false;
760  std::unordered_set<std::string> rg_tag_filter_;
761  bool inverse_rg_tag_filter_ = false;
762 
763  // Unused / need to investiage if needed
764  // int max_indel_depth_ = 250;
765  // bool skip_empty_positions_ = true;
766  // bool detect_overlapping_read_pairs_ = false;
767 
768 };
769 
770 } // namespace population
771 } // namespace genesis
772 
773 #endif // htslib guard
774 #endif // include guard
genesis::population::SamVariantInputIterator
Input iterator for SAM/BAM/CRAM files that produces a Variant per genome position.
Definition: sam_variant_input_iterator.hpp:102
genesis::population::SamVariantInputIterator::min_depth
int min_depth() const
Definition: sam_variant_input_iterator.hpp:576
genesis::population::SamVariantInputIterator::rg_tag_filter
self_type & rg_tag_filter(std::unordered_set< std::string > const &value)
Set the sample names used for filtering reads by their RG read group tag.
Definition: sam_variant_input_iterator.hpp:699
base_counts.hpp
genesis::population::SamVariantInputIterator::difference_type
std::ptrdiff_t difference_type
Definition: sam_variant_input_iterator.hpp:114
genesis::population::SamVariantInputIterator::~SamVariantInputIterator
~SamVariantInputIterator()=default
genesis::population::SamVariantInputIterator::end
Iterator end() const
Definition: sam_variant_input_iterator.hpp:386
fs.hpp
Provides functions for accessing the file system.
genesis::population::SamVariantInputIterator::input_file
self_type & input_file(std::string const &value)
Set the input file.
Definition: sam_variant_input_iterator.hpp:406
genesis::population::SamVariantInputIterator::Iterator::iterator_category
std::input_iterator_tag iterator_category
Definition: sam_variant_input_iterator.hpp:168
genesis::population::SamVariantInputIterator::min_base_qual
int min_base_qual() const
Definition: sam_variant_input_iterator.hpp:555
genesis::population::SamVariantInputIterator::iterator_category
std::input_iterator_tag iterator_category
Definition: sam_variant_input_iterator.hpp:115
genesis::population::SamVariantInputIterator::Iterator::operator++
self_type & operator++()
Definition: sam_variant_input_iterator.hpp:216
genome_locus.hpp
genesis::population::SamVariantInputIterator::Iterator
Iterator over loci of the input sources.
Definition: sam_variant_input_iterator.hpp:156
genesis::population::SamVariantInputIterator::Iterator::reference
value_type const & reference
Definition: sam_variant_input_iterator.hpp:167
genesis::population::SamVariantInputIterator::flags_include_all
self_type & flags_include_all(uint32_t value)
Only use reads with all bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_iterator.hpp:439
genesis::population::SamVariantInputIterator::min_map_qual
int min_map_qual() const
Definition: sam_variant_input_iterator.hpp:537
genesis::population::SamVariantInputIterator::max_accumulation_depth
self_type & max_accumulation_depth(int value)
Set the maximum depth (coverage) at a given position that is actually processed.
Definition: sam_variant_input_iterator.hpp:624
genesis::population::SamVariantInputIterator::Iterator::operator->
value_type * operator->()
Definition: sam_variant_input_iterator.hpp:197
genesis::population::SamVariantInputIterator::inverse_rg_tag_filter
bool inverse_rg_tag_filter() const
Definition: sam_variant_input_iterator.hpp:705
genesis::population::SamVariantInputIterator::input_file
std::string const & input_file() const
Definition: sam_variant_input_iterator.hpp:395
genesis::population::SamVariantInputIterator::flags_include_all
uint32_t flags_include_all() const
Definition: sam_variant_input_iterator.hpp:416
genome_locus.hpp
genesis::population::SamVariantInputIterator::rg_tag_filter
std::unordered_set< std::string > const & rg_tag_filter() const
Definition: sam_variant_input_iterator.hpp:677
genesis::population::SamVariantInputIterator::with_unaccounted_rg
bool with_unaccounted_rg() const
Definition: sam_variant_input_iterator.hpp:653
genesis::population::SamVariantInputIterator::Iterator::SamVariantInputIterator
friend SamVariantInputIterator
Definition: sam_variant_input_iterator.hpp:185
genesis::population::SamVariantInputIterator::reference
value_type const & reference
Definition: sam_variant_input_iterator.hpp:113
genesis::population::SamVariantInputIterator::min_base_qual
self_type & min_base_qual(int value)
Set the minimum phred-scaled per-base quality score for a nucleotide to be considered.
Definition: sam_variant_input_iterator.hpp:566
genesis::population::SamVariantInputIterator::SamVariantInputIterator
SamVariantInputIterator(std::string const &input_file)
Definition: sam_variant_input_iterator.hpp:350
genesis::population::SamVariantInputIterator::Iterator::sample_size
size_t sample_size() const
Return the size of the Variant::sample vector of BaseCounts that is produced by the iterator.
Definition: sam_variant_input_iterator.cpp:732
genesis::population::SamVariantInputIterator::Iterator::operator*
const value_type & operator*() const
Definition: sam_variant_input_iterator.hpp:202
genesis::population::SamVariantInputIterator::pointer
value_type const * pointer
Definition: sam_variant_input_iterator.hpp:112
genesis::population::SamVariantInputIterator::Iterator::rg_tags
std::vector< std::string > rg_tags(bool all_header_tags=false) const
Get the list of read group RG tags as used in the iteration, or as found in the SAM/BAM/CRAM file.
Definition: sam_variant_input_iterator.cpp:698
genesis::population::SamVariantInputIterator::min_map_qual
self_type & min_map_qual(int value)
Set the minimum phred-scaled mapping quality score for a read in the input file to be considered.
Definition: sam_variant_input_iterator.hpp:549
genesis::population::SamVariantInputIterator::Iterator::SamFileHandle
friend SamFileHandle
Definition: sam_variant_input_iterator.hpp:186
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SamVariantInputIterator::flags_exclude_all
self_type & flags_exclude_all(uint32_t value)
Do not use reads with all bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_iterator.hpp:497
genesis::population::SamVariantInputIterator::min_depth
self_type & min_depth(int value)
Set the minimum depth (coverage) at a given position to be considered.
Definition: sam_variant_input_iterator.hpp:586
genesis::population::SamVariantInputIterator::Iterator::operator->
const value_type * operator->() const
Definition: sam_variant_input_iterator.hpp:192
genesis::population::SamVariantInputIterator::flags_exclude_any
self_type & flags_exclude_any(uint32_t value)
Do not use reads with any bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_iterator.hpp:527
genesis::population::SamVariantInputIterator::max_depth
self_type & max_depth(int value)
Set the maximum depth (coverage) at a given position to be considered.
Definition: sam_variant_input_iterator.hpp:603
genesis::population::SamVariantInputIterator::flags_exclude_all
uint32_t flags_exclude_all() const
Definition: sam_variant_input_iterator.hpp:474
genesis::population::SamVariantInputIterator::Iterator::operator*
value_type & operator*()
Definition: sam_variant_input_iterator.hpp:207
genesis::utils::is_file
bool is_file(std::string const &path)
Return true iff the provided path is a file.
Definition: fs.cpp:73
genesis::population::SamVariantInputIterator::flags_include_any
uint32_t flags_include_any() const
Definition: sam_variant_input_iterator.hpp:445
genesis::population::SamVariantInputIterator::Iterator::pointer
value_type const * pointer
Definition: sam_variant_input_iterator.hpp:166
genesis::population::SamVariantInputIterator::max_accumulation_depth
int max_accumulation_depth() const
Definition: sam_variant_input_iterator.hpp:609
genesis::population::SamVariantInputIterator::split_by_rg
self_type & split_by_rg(bool value)
If set to true, instead of reading all mapped reads as a single sample, split them by the @RG read gr...
Definition: sam_variant_input_iterator.hpp:647
genesis::population::SamVariantInputIterator::with_unaccounted_rg
self_type & with_unaccounted_rg(bool value)
Decide whether to add a sample for reads without a read group, when splitting by @RG tag.
Definition: sam_variant_input_iterator.hpp:671
bam_plp_t
struct bam_plp_s * bam_plp_t
Definition: sam_variant_input_iterator.hpp:60
variant.hpp
genesis::population::SamVariantInputIterator::inverse_rg_tag_filter
self_type & inverse_rg_tag_filter(bool value)
Reverse the meaning of the list of sample names given by rg_tag_filter().
Definition: sam_variant_input_iterator.hpp:715
genesis::population::SamVariantInputIterator::operator=
SamVariantInputIterator & operator=(SamVariantInputIterator const &)=default
genesis::population::SamVariantInputIterator::Iterator::operator!=
bool operator!=(self_type const &it) const
Definition: sam_variant_input_iterator.hpp:241
genesis::population::SamVariantInputIterator::Iterator::operator=
Iterator & operator=(self_type const &)=default
genesis::population::SamVariantInputIterator::flags_exclude_any
uint32_t flags_exclude_any() const
Definition: sam_variant_input_iterator.hpp:504
genesis::population::SamVariantInputIterator::flags_include_any
self_type & flags_include_any(uint32_t value)
Only use reads with any bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_iterator.hpp:468
genesis::population::SamVariantInputIterator::begin
Iterator begin() const
Definition: sam_variant_input_iterator.hpp:374
genesis::population::SamVariantInputIterator::Iterator::operator==
bool operator==(self_type const &it) const
Compare two iterators for equality.
Definition: sam_variant_input_iterator.hpp:236
genesis::population::SamVariantInputIterator::max_depth
int max_depth() const
Definition: sam_variant_input_iterator.hpp:592
genesis::population::SamVariantInputIterator::SamVariantInputIterator
SamVariantInputIterator()
Create a default instance, with no input. This is also the past-the-end iterator.
Definition: sam_variant_input_iterator.hpp:346
genesis::population::SamVariantInputIterator::Iterator::~Iterator
~Iterator()=default
genesis::population::SamVariantInputIterator::split_by_rg
bool split_by_rg() const
Definition: sam_variant_input_iterator.hpp:634