A library for working with phylogenetic and population genetic data.
v0.32.0
sam_variant_input_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMAT_SAM_VARIANT_INPUT_STREAM_H_
2 #define GENESIS_POPULATION_FORMAT_SAM_VARIANT_INPUT_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
43 
44 #include <cstdint>
45 #include <memory>
46 #include <string>
47 #include <unordered_map>
48 #include <unordered_set>
49 #include <vector>
50 
51 // Forward declarations of htslib structs, so that we do not have to include their headers
52 // and completely spam our namespace here.
53 extern "C" {
54 
55  struct htsFile;
56  struct hts_itr_t;
57  struct sam_hdr_t;
58  struct bam1_t;
59 
60  struct bam_plp_s;
61  typedef struct bam_plp_s *bam_plp_t;
62 
63  struct bam_pileup1_t;
64  // typedef struct bam_pileup1_t *bam_pileup1_t;
65 
66 }
67 
68 namespace genesis {
69 namespace population {
70 
71 // =================================================================================================
72 // SAM/BAM/CRAM File Input Stream
73 // =================================================================================================
74 
104 {
105 public:
106 
107  // -------------------------------------------------------------------------
108  // Member Types
109  // -------------------------------------------------------------------------
110 
113  using pointer = value_type const*;
114  using reference = value_type const&;
115  using difference_type = std::ptrdiff_t;
116  using iterator_category = std::input_iterator_tag;
117 
118  // ======================================================================================
119  // Sam File Handle
120  // ======================================================================================
121 
122 private:
123 
142  struct SamFileHandle;
143 
144  // ======================================================================================
145  // Internal Iterator
146  // ======================================================================================
147 
148 public:
149 
157  class Iterator
158  {
159  public:
160 
161  // -------------------------------------------------------------------------
162  // Constructors and Rule of Five
163  // -------------------------------------------------------------------------
164 
167  using pointer = value_type const*;
168  using reference = value_type const&;
169  using iterator_category = std::input_iterator_tag;
170 
171  private:
172 
173  Iterator() = default;
174  Iterator( SamVariantInputStream const* parent );
175 
176  public:
177 
178  ~Iterator() = default;
179 
180  Iterator( self_type const& ) = default;
181  Iterator( self_type&& ) = default;
182 
183  Iterator& operator= ( self_type const& ) = default;
184  Iterator& operator= ( self_type&& ) = default;
185 
188 
189  // -------------------------------------------------------------------------
190  // Accessors
191  // -------------------------------------------------------------------------
192 
193  value_type const * operator->() const
194  {
195  return &current_variant_;
196  }
197 
199  {
200  return &current_variant_;
201  }
202 
203  value_type const & operator*() const
204  {
205  return current_variant_;
206  }
207 
209  {
210  return current_variant_;
211  }
212 
213  // -------------------------------------------------------------------------
214  // Iteration
215  // -------------------------------------------------------------------------
216 
218  {
219  increment_();
220  return *this;
221  }
222 
223  // self_type operator ++(int)
224  // {
225  // auto cpy = *this;
226  // increment_();
227  // return cpy;
228  // }
229 
237  bool operator==( self_type const& it ) const
238  {
239  return parent_ == it.parent_;
240  }
241 
242  bool operator!=( self_type const& it ) const
243  {
244  return !(*this == it);
245  }
246 
247  // -------------------------------------------------------------------------
248  // Data Access
249  // -------------------------------------------------------------------------
250 
283  std::vector<std::string> rg_tags( bool all_header_tags = false ) const;
284 
289  size_t sample_size() const;
290 
291  // -------------------------------------------------------------------------
292  // Internal Members
293  // -------------------------------------------------------------------------
294 
295  private:
296 
300  void increment_();
301 
309  void process_base_( bam_pileup1_t const* p );
310 
318  size_t get_sample_index_( bam_pileup1_t const* p ) const;
319 
320  private:
321 
322  // Parent.
323  SamVariantInputStream const* parent_ = nullptr;
324 
325  // htslib specific file handling pointers during iteration.
326  // We put this in a shared ptr so that the iterator can be copied,
327  // but the htslib data gets only freed once all instances are done.
328  // This also allows PIMPL to avoid including the htslib headers here.
329  std::shared_ptr<SamFileHandle> handle_;
330 
331  // Current variant object, keeping the base tally of the current locus.
332  Variant current_variant_;
333 
334  };
335 
336  // ======================================================================================
337  // Main Class
338  // ======================================================================================
339 
340  // -------------------------------------------------------------------------
341  // Constructors and Rule of Five
342  // -------------------------------------------------------------------------
343 
348  : SamVariantInputStream( std::string{} )
349  {}
350 
352  std::string const& input_file
353  )
354  : SamVariantInputStream( input_file, std::unordered_set<std::string>{}, false )
355  {}
356 
358  std::string const& input_file,
359  std::unordered_set<std::string> const& rg_tag_filter,
360  bool inverse_rg_tag_filter = false
361  );
362 
363  ~SamVariantInputStream() = default;
364 
365  SamVariantInputStream( SamVariantInputStream const& ) = default;
367 
370 
371  // -------------------------------------------------------------------------
372  // Iteration
373  // -------------------------------------------------------------------------
374 
375  Iterator begin() const
376  {
377  // Better error messaging that what htslib would give us if the file did not exist.
378  // We check here, as input_file() allows to change the file after construction,
379  // so we only do the check once we know that we are good to go.
380  std::string err_str;
381  if( ! utils::file_is_readable( input_file_, err_str )) {
382  throw std::runtime_error(
383  "Cannot open input sam/bam/cram file '" + input_file_ + "': " + err_str
384  );
385  }
386 
387  return Iterator( this );
388  }
389 
390  Iterator end() const
391  {
392  return Iterator();
393  }
394 
395  // -------------------------------------------------------------------------
396  // Basic Input Settings
397  // -------------------------------------------------------------------------
398 
399  std::string const& input_file() const
400  {
401  return input_file_;
402  }
403 
410  self_type& input_file( std::string const& value )
411  {
412  input_file_ = value;
413  return *this;
414  }
415 
416  // -------------------------------------------------------------------------
417  // Flag Settings
418  // -------------------------------------------------------------------------
419 
420  uint32_t flags_include_all() const
421  {
422  return flags_include_all_;
423  }
424 
443  self_type& flags_include_all( uint32_t value )
444  {
445  flags_include_all_ = value;
446  return *this;
447  }
448 
449  uint32_t flags_include_any() const
450  {
451  return flags_include_any_;
452  }
453 
472  self_type& flags_include_any( uint32_t value )
473  {
474  flags_include_any_ = value;
475  return *this;
476  }
477 
478  uint32_t flags_exclude_all() const
479  {
480  return flags_exclude_all_;
481  }
482 
501  self_type& flags_exclude_all( uint32_t value )
502  {
503 
504  flags_exclude_all_ = value;
505  return *this;
506  }
507 
508  uint32_t flags_exclude_any() const
509  {
510  return flags_exclude_any_;
511  }
512 
531  self_type& flags_exclude_any( uint32_t value )
532  {
533  flags_exclude_any_ = value;
534  return *this;
535  }
536 
537  // -------------------------------------------------------------------------
538  // Region Filter
539  // -------------------------------------------------------------------------
540 
549  self_type& region_filter( std::shared_ptr<GenomeLocusSet> region_filter )
550  {
551  region_filter_ = region_filter;
552  return *this;
553  }
554 
555  // -------------------------------------------------------------------------
556  // Quality Settings
557  // -------------------------------------------------------------------------
558 
559  int min_map_qual() const
560  {
561  return min_map_qual_;
562  }
563 
571  self_type& min_map_qual( int value )
572  {
573  min_map_qual_ = value;
574  return *this;
575  }
576 
577  int min_base_qual() const
578  {
579  return min_base_qual_;
580  }
581 
588  self_type& min_base_qual( int value )
589  {
590  min_base_qual_ = value;
591  return *this;
592  }
593 
594  // -------------------------------------------------------------------------
595  // Depth Settings
596  // -------------------------------------------------------------------------
597 
598  int min_depth() const
599  {
600  return min_depth_;
601  }
602 
608  self_type& min_depth( int value )
609  {
610  min_depth_ = value;
611  return *this;
612  }
613 
614  int max_depth() const
615  {
616  return max_depth_;
617  }
618 
625  self_type& max_depth( int value )
626  {
627  max_depth_ = value;
628  return *this;
629  }
630 
632  {
633  return max_acc_depth_;
634  }
635 
647  {
648  max_acc_depth_ = value;
649  return *this;
650  }
651 
652  // -------------------------------------------------------------------------
653  // Read Group Settings
654  // -------------------------------------------------------------------------
655 
656  bool split_by_rg() const
657  {
658  return split_by_rg_;
659  }
660 
669  self_type& split_by_rg( bool value )
670  {
671  split_by_rg_ = value;
672  return *this;
673  }
674 
675  bool with_unaccounted_rg() const
676  {
677  return with_unaccounted_rg_;
678  }
679 
694  {
695  with_unaccounted_rg_ = value;
696  return *this;
697  }
698 
699  std::unordered_set<std::string> const& rg_tag_filter() const
700  {
701  return rg_tag_filter_;
702  }
703 
721  self_type& rg_tag_filter( std::unordered_set<std::string> const& value )
722  {
723  rg_tag_filter_ = value;
724  return *this;
725  }
726 
728  {
729  return inverse_rg_tag_filter_;
730  }
731 
738  {
739  inverse_rg_tag_filter_ = value;
740  return *this;
741  }
742 
743  // bool skip_empty_positions() const
744  // {
745  // return skip_empty_positions_;
746  // }
747  //
748  // self_type& skip_empty_positions( bool value )
749  // {
750  // skip_empty_positions_ = value;
751  // return *this;
752  // }
753 
754  // -------------------------------------------------------------------------
755  // Data Members
756  // -------------------------------------------------------------------------
757 
758 private:
759 
760  // Input data
761  std::string input_file_;
762 
763  // Settings
764 
765  // Read filtering flags, as used by htslib and samtools
766  uint32_t flags_include_all_ = 0;
767  uint32_t flags_include_any_ = 0;
768  uint32_t flags_exclude_all_ = 0;
769  uint32_t flags_exclude_any_ = 0;
770 
771  // Filter for which positions to consider or skip
772  std::shared_ptr<GenomeLocusSet> region_filter_;
773 
774  // Minimum mapping and base qualities
775  int min_map_qual_ = 0;
776  int min_base_qual_ = 0;
777 
778  // Read depth / coverage
779  int min_depth_ = 0;
780  int max_depth_ = 0;
781  int max_acc_depth_ = 0;
782 
783  bool split_by_rg_ = false;
784  bool with_unaccounted_rg_ = false;
785  std::unordered_set<std::string> rg_tag_filter_;
786  bool inverse_rg_tag_filter_ = false;
787 
788  // Unused / need to investiage if needed
789  // int max_indel_depth_ = 250;
790  // bool skip_empty_positions_ = true;
791  // bool detect_overlapping_read_pairs_ = false;
792 
793 };
794 
795 } // namespace population
796 } // namespace genesis
797 
798 #endif // htslib guard
799 #endif // include guard
genesis::population::SamVariantInputStream::min_base_qual
self_type & min_base_qual(int value)
Set the minimum phred-scaled per-base quality score for a nucleotide to be considered.
Definition: sam_variant_input_stream.hpp:588
genome_locus.hpp
genesis::population::SamVariantInputStream::difference_type
std::ptrdiff_t difference_type
Definition: sam_variant_input_stream.hpp:115
genesis::population::SamVariantInputStream::SamVariantInputStream
SamVariantInputStream(std::string const &input_file)
Definition: sam_variant_input_stream.hpp:351
fs.hpp
Provides functions for accessing the file system.
genesis::population::SamVariantInputStream::inverse_rg_tag_filter
bool inverse_rg_tag_filter() const
Definition: sam_variant_input_stream.hpp:727
genesis::population::SamVariantInputStream::pointer
value_type const * pointer
Definition: sam_variant_input_stream.hpp:113
genesis::population::SamVariantInputStream::Iterator::operator=
Iterator & operator=(self_type const &)=default
genesis::population::SamVariantInputStream
Input stream for SAM/BAM/CRAM files that produces a Variant per genome position.
Definition: sam_variant_input_stream.hpp:103
genesis::population::SamVariantInputStream::Iterator::sample_size
size_t sample_size() const
Return the size of the Variant::sample vector of SampleCounts that is produced by the iterator.
Definition: sam_variant_input_stream.cpp:732
genome_locus.hpp
genesis::population::SamVariantInputStream::flags_exclude_all
uint32_t flags_exclude_all() const
Definition: sam_variant_input_stream.hpp:478
genesis::population::SamVariantInputStream::Iterator::reference
value_type const & reference
Definition: sam_variant_input_stream.hpp:168
genesis::population::SamVariantInputStream::Iterator::~Iterator
~Iterator()=default
genesis::population::SamVariantInputStream::flags_exclude_any
self_type & flags_exclude_any(uint32_t value)
Do not use reads with any bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_stream.hpp:531
genesis::population::SamVariantInputStream::split_by_rg
self_type & split_by_rg(bool value)
If set to true, instead of reading all mapped reads as a single sample, split them by the @RG read gr...
Definition: sam_variant_input_stream.hpp:669
genesis::population::SamVariantInputStream::Iterator::operator*
const value_type & operator*() const
Definition: sam_variant_input_stream.hpp:203
genesis::population::SamVariantInputStream::min_base_qual
int min_base_qual() const
Definition: sam_variant_input_stream.hpp:577
genesis::population::SamVariantInputStream::Iterator
Iterator over loci of the input sources.
Definition: sam_variant_input_stream.hpp:157
genesis::population::SamVariantInputStream::min_map_qual
self_type & min_map_qual(int value)
Set the minimum phred-scaled mapping quality score for a read in the input file to be considered.
Definition: sam_variant_input_stream.hpp:571
genesis::population::SamVariantInputStream::flags_include_all
self_type & flags_include_all(uint32_t value)
Only use reads with all bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_stream.hpp:443
genesis::population::SamVariantInputStream::operator=
SamVariantInputStream & operator=(SamVariantInputStream const &)=default
genesis::population::SamVariantInputStream::end
Iterator end() const
Definition: sam_variant_input_stream.hpp:390
genesis::population::SamVariantInputStream::Iterator::pointer
value_type const * pointer
Definition: sam_variant_input_stream.hpp:167
genesis::population::SamVariantInputStream::input_file
std::string const & input_file() const
Definition: sam_variant_input_stream.hpp:399
genesis::population::SamVariantInputStream::min_depth
self_type & min_depth(int value)
Set the minimum read depth (coverage) at a given position to be considered.
Definition: sam_variant_input_stream.hpp:608
genesis::population::SamVariantInputStream::Iterator::SamFileHandle
friend SamFileHandle
Definition: sam_variant_input_stream.hpp:187
genesis::population::SamVariantInputStream::Iterator::operator->
const value_type * operator->() const
Definition: sam_variant_input_stream.hpp:193
sample_counts.hpp
genesis::population::SamVariantInputStream::with_unaccounted_rg
self_type & with_unaccounted_rg(bool value)
Decide whether to add a sample for reads without a read group, when splitting by @RG tag.
Definition: sam_variant_input_stream.hpp:693
genesis::population::SamVariantInputStream::Iterator::iterator_category
std::input_iterator_tag iterator_category
Definition: sam_variant_input_stream.hpp:169
genesis::population::SamVariantInputStream::inverse_rg_tag_filter
self_type & inverse_rg_tag_filter(bool value)
Reverse the meaning of the list of sample names given by rg_tag_filter().
Definition: sam_variant_input_stream.hpp:737
genesis::population::SamVariantInputStream::flags_include_all
uint32_t flags_include_all() const
Definition: sam_variant_input_stream.hpp:420
genesis::population::SamVariantInputStream::input_file
self_type & input_file(std::string const &value)
Set the input file.
Definition: sam_variant_input_stream.hpp:410
genesis::population::SamVariantInputStream::~SamVariantInputStream
~SamVariantInputStream()=default
genesis::population::SamVariantInputStream::flags_exclude_any
uint32_t flags_exclude_any() const
Definition: sam_variant_input_stream.hpp:508
genesis::population::SamVariantInputStream::Iterator::operator*
value_type & operator*()
Definition: sam_variant_input_stream.hpp:208
genesis::population::SamVariantInputStream::rg_tag_filter
self_type & rg_tag_filter(std::unordered_set< std::string > const &value)
Set the sample names used for filtering reads by their RG read group tag.
Definition: sam_variant_input_stream.hpp:721
genesis::population::SamVariantInputStream::min_map_qual
int min_map_qual() const
Definition: sam_variant_input_stream.hpp:559
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SamVariantInputStream::min_depth
int min_depth() const
Definition: sam_variant_input_stream.hpp:598
genesis::population::SamVariantInputStream::begin
Iterator begin() const
Definition: sam_variant_input_stream.hpp:375
genesis::population::SamVariantInputStream::split_by_rg
bool split_by_rg() const
Definition: sam_variant_input_stream.hpp:656
genome_locus_set.hpp
genesis::utils::file_is_readable
bool file_is_readable(std::string const &filename)
Return whether a file is readable.
Definition: fs.cpp:108
genesis::population::SamVariantInputStream::max_depth
self_type & max_depth(int value)
Set the maximum read depth (coverage) at a given position to be considered.
Definition: sam_variant_input_stream.hpp:625
genesis::population::SamVariantInputStream::flags_exclude_all
self_type & flags_exclude_all(uint32_t value)
Do not use reads with all bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_stream.hpp:501
genesis::population::SamVariantInputStream::max_depth
int max_depth() const
Definition: sam_variant_input_stream.hpp:614
genesis::population::SamVariantInputStream::Iterator::rg_tags
std::vector< std::string > rg_tags(bool all_header_tags=false) const
Get the list of read group RG tags as used in the iteration, or as found in the SAM/BAM/CRAM file.
Definition: sam_variant_input_stream.cpp:698
genesis::population::SamVariantInputStream::max_accumulation_depth
self_type & max_accumulation_depth(int value)
Set the maximum read depth (coverage) at a given position that is actually processed.
Definition: sam_variant_input_stream.hpp:646
genesis::population::SamVariantInputStream::flags_include_any
uint32_t flags_include_any() const
Definition: sam_variant_input_stream.hpp:449
genesis::population::SamVariantInputStream::SamVariantInputStream
SamVariantInputStream()
Create a default instance, with no input. This is also the past-the-end iterator.
Definition: sam_variant_input_stream.hpp:347
genesis::population::SamVariantInputStream::max_accumulation_depth
int max_accumulation_depth() const
Definition: sam_variant_input_stream.hpp:631
genesis::population::SamVariantInputStream::reference
value_type const & reference
Definition: sam_variant_input_stream.hpp:114
genesis::population::SamVariantInputStream::Iterator::SamVariantInputStream
friend SamVariantInputStream
Definition: sam_variant_input_stream.hpp:186
genesis::population::SamVariantInputStream::with_unaccounted_rg
bool with_unaccounted_rg() const
Definition: sam_variant_input_stream.hpp:675
variant.hpp
genesis::population::SamVariantInputStream::Iterator::operator++
self_type & operator++()
Definition: sam_variant_input_stream.hpp:217
genesis::population::SamVariantInputStream::flags_include_any
self_type & flags_include_any(uint32_t value)
Only use reads with any bits set in value present in the FLAG field of the read.
Definition: sam_variant_input_stream.hpp:472
genesis::population::SamVariantInputStream::Iterator::operator!=
bool operator!=(self_type const &it) const
Definition: sam_variant_input_stream.hpp:242
genesis::population::SamVariantInputStream::rg_tag_filter
std::unordered_set< std::string > const & rg_tag_filter() const
Definition: sam_variant_input_stream.hpp:699
bam_plp_t
struct bam_plp_s * bam_plp_t
Definition: sam_variant_input_stream.hpp:61
genesis::population::SamVariantInputStream::iterator_category
std::input_iterator_tag iterator_category
Definition: sam_variant_input_stream.hpp:116
genesis::population::SamVariantInputStream::Iterator::operator==
bool operator==(self_type const &it) const
Compare two iterators for equality.
Definition: sam_variant_input_stream.hpp:237
genesis::population::SamVariantInputStream::Iterator::operator->
value_type * operator->()
Definition: sam_variant_input_stream.hpp:198
genesis::population::SamVariantInputStream::region_filter
self_type & region_filter(std::shared_ptr< GenomeLocusSet > region_filter)
Set a region filter, so that only loci set in the loci are used, and all others are skipped.
Definition: sam_variant_input_stream.hpp:549