A library for working with phylogenetic data.
v0.25.0
vcf_record.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_VCF_RECORD_H_
2 #define GENESIS_POPULATION_FORMATS_VCF_RECORD_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2021 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
40 
41 #include <cstdint>
42 #include <string>
43 #include <vector>
44 
45 extern "C" {
46  // #include <htslib/hts.h>
47  // #include <htslib/vcf.h>
48 
49  struct bcf1_t;
50 }
51 
52 // =================================================================================================
53 // Forward Declarations
54 // =================================================================================================
55 
56 namespace genesis {
57 namespace population {
58 
59 class HtsFile;
60 class VcfHeader;
61 
62 // =================================================================================================
63 // VCF/BCF Record
64 // =================================================================================================
65 
108 {
109 public:
110 
111  // -------------------------------------------------------------------------
112  // Typedefs and Enums
113  // -------------------------------------------------------------------------
114 
121  enum class VariantType : int {
122  kRef = 0,
123  kSnp = 1,
124  kMnp = 2,
125  kIndel = 4,
126  kOther = 8,
127  kBreakend = 16, // Breakend
128  kOverlap = 32, // Overlapping deletion, ALT=*
129  };
130 
148  {
149  return ( static_cast<int>(a) & static_cast<int>(b) );
150  }
151 
152  // -------------------------------------------------------------------------
153  // Constructors and Rule of Five
154  // -------------------------------------------------------------------------
155 
159  VcfRecord();
160 
167  explicit VcfRecord( VcfHeader& header );
168 
175  VcfRecord( VcfHeader& header, ::bcf1_t* bcf1 );
176 
177  ~VcfRecord();
178 
179  VcfRecord( VcfRecord const& ) = delete;
180  VcfRecord( VcfRecord&& );
181 
182  VcfRecord& operator= ( VcfRecord const& ) = delete;
184 
185  void swap( VcfRecord& other );
186 
187  // -------------------------------------------------------------------------
188  // Accessors
189  // -------------------------------------------------------------------------
190 
194  ::bcf1_t* data()
195  {
196  return record_;
197  }
198 
202  ::bcf1_t const* data() const
203  {
204  return record_;
205  }
206 
214  {
215  return *header_;
216  }
217 
224  VcfHeader const& header() const
225  {
226  return *header_;
227  }
228 
237  void unpack() const;
238 
239  // -------------------------------------------------------------------------
240  // Simple Fixed Columns
241  // -------------------------------------------------------------------------
242 
246  std::string get_chromosome() const;
247 
253  size_t get_position() const;
254 
261  std::string get_id() const;
262 
270  std::string at() const;
271 
275  std::string get_reference() const;
276 
280  std::vector<std::string> get_alternatives() const;
281 
289  std::string get_alternative( size_t index ) const;
290 
302  size_t get_alternatives_count() const;
303 
312  std::vector<std::string> get_variants() const;
313 
324  std::string get_variant( size_t index ) const;
325 
332  size_t get_variant_count() const;
333 
355 
363  VariantType get_variant_type( size_t alt_index ) const;
364 
372  bool is_snp() const;
373 
377  double get_quality() const;
378 
379  // -------------------------------------------------------------------------
380  // Filter Column
381  // -------------------------------------------------------------------------
382 
395  std::vector<std::string> get_filter_ids() const;
396 
403  bool has_filter( std::string const& filter ) const;
404 
412  bool pass_filter() const;
413 
414  // -------------------------------------------------------------------------
415  // Info Column
416  // -------------------------------------------------------------------------
417 
428  std::vector<std::string> get_info_ids() const;
429 
433  bool has_info( std::string const& id ) const;
434 
438  bool has_info( char const* id ) const;
439 
446  void assert_info( std::string const& id ) const;
447 
451  void assert_info( char const* ) const;
452 
459  std::string get_info_string( std::string const& id ) const;
460 
469  void get_info_string( std::string const& id, std::string& destination ) const;
470 
478  std::vector<double> get_info_float( std::string const& id ) const;
479 
489  void get_info_float( std::string const& id, std::vector<double>& destination ) const;
490 
496  std::vector<int32_t> get_info_int( std::string const& id ) const;
497 
506  void get_info_int( std::string const& id, std::vector<int32_t>& destination ) const;
507 
514  bool get_info_flag( std::string const& id ) const;
515 
516  // -------------------------------------------------------------------------
517  // Format Column
518  // -------------------------------------------------------------------------
519 
530  std::vector<std::string> get_format_ids() const;
531 
535  bool has_format( std::string const& id ) const;
536 
540  bool has_format( char const* id ) const;
541 
548  void assert_format( std::string const& id ) const;
549 
553  void assert_format( char const* id ) const;
554 
555  // -------------------------------------------------------------------------
556  // Sample Columns
557  // -------------------------------------------------------------------------
558 
573 
581 
598 
610  VcfFormatIteratorString begin_format_string( std::string const& id ) const;
611 
619 
631 
638  VcfFormatIteratorInt begin_format_int( std::string const& id ) const;
639 
647 
654  genesis::utils::Range<VcfFormatIteratorInt> get_format_int( std::string const& id ) const;
655 
662  VcfFormatIteratorFloat begin_format_float( std::string const& id ) const;
663 
671 
679 
680  // -------------------------------------------------------------------------
681  // Modifiers
682  // -------------------------------------------------------------------------
683 
697  bool read_next( HtsFile& source );
698 
699  // -------------------------------------------------------------------------
700  // Internal Members
701  // -------------------------------------------------------------------------
702 
703 private:
704 
709  int get_info_ptr_( std::string const& id, int ht_type, void** dest, int* ndest ) const;
710 
711  // -------------------------------------------------------------------------
712  // Data Members
713  // -------------------------------------------------------------------------
714 
715 private:
716 
717  // Here, we only manage the record_ pointer instance. The header takes care of itself,
718  // and is only pointed to from here, but not managed.
719  VcfHeader* header_ = nullptr;
720  mutable ::bcf1_t* record_ = nullptr;
721 
722  // htslib wants to copy values all the time, so we reserve buffers to avoid reallocations.
723  mutable char* info_dest_string_ = nullptr;
724  mutable float* info_dest_float_ = nullptr;
725  mutable int32_t* info_dest_int_ = nullptr;
726  mutable int info_ndest_string_ = 0;
727  mutable int info_ndest_float_ = 0;
728  mutable int info_ndest_int_ = 0;
729 };
730 
731 } // namespace population
732 } // namespace genesis
733 
734 #endif // htslib guard
735 #endif // include guard
genesis::population::VcfRecord::is_snp
bool is_snp() const
Return whether this variant is a SNP.
Definition: vcf_record.cpp:297
genesis::population::VcfRecord::at
std::string at() const
Return a textual representation of the current record chromosome position.
Definition: vcf_record.cpp:189
genesis::population::VcfRecord::begin_format_float
VcfFormatIteratorFloat begin_format_float(std::string const &id) const
Get the begin iterator over the samples that accesses a certain FORMAT id as a float value.
Definition: vcf_record.cpp:572
genesis::population::VcfRecord::get_variant_type
VariantType get_variant_type(size_t alt_index) const
Get the variant type of a particular alternative allele/sequence.
Definition: vcf_record.cpp:280
genesis::population::VcfRecord::header
VcfHeader & header()
Return the VcfHeader instance associated with this record.
Definition: vcf_record.hpp:213
genesis::population::VcfRecord::get_format_ids
std::vector< std::string > get_format_ids() const
Get the list of all format IDs (FORMAT column) that the record contains.
Definition: vcf_record.cpp:477
genesis::population::VcfRecord::operator&
friend bool operator&(VcfRecord::VariantType a, VcfRecord::VariantType b)
And-operator for VariantTypes.
Definition: vcf_record.hpp:147
genesis::population::VcfRecord::assert_info
void assert_info(std::string const &id) const
Assert that an INFO entry with a given id is present in the record.
Definition: vcf_record.cpp:375
genesis::population::VcfRecord::operator=
VcfRecord & operator=(VcfRecord const &)=delete
genesis::population::VcfRecord::VariantType::kSnp
@ kSnp
genesis::population::VcfRecord::header
VcfHeader const & header() const
Return the VcfHeader instance associated with this record.
Definition: vcf_record.hpp:224
genesis::population::VcfRecord::begin_format_genotype
VcfFormatIteratorGenotype begin_format_genotype() const
Get the begin iterator over the samples that accesses the FORMAT genotype (GT field/key/id) as a set ...
Definition: vcf_record.cpp:517
genesis::population::VcfRecord::get_info_string
std::string get_info_string(std::string const &id) const
Return the info value for the given key id as a string.
Definition: vcf_record.cpp:389
genesis::population::VcfRecord::assert_format
void assert_format(std::string const &id) const
Assert that a FORMAT entry with a given id is present in the record.
Definition: vcf_record.cpp:499
genesis::population::VcfRecord::get_info_flag
bool get_info_flag(std::string const &id) const
Return whether an INFO flag is set, that is, whether the info value for a given key id is present in ...
Definition: vcf_record.cpp:467
genesis::population::VcfRecord::VariantType::kBreakend
@ kBreakend
genesis::population::VcfRecord::get_id
std::string get_id() const
Get the ID string of the variant (ID, third column of the line).
Definition: vcf_record.cpp:183
genesis::population::VcfRecord::begin_format_int
VcfFormatIteratorInt begin_format_int(std::string const &id) const
Get the begin iterator over the samples that accesses a certain FORMAT id as an int value.
Definition: vcf_record.cpp:553
genesis::population::VcfRecord::end_format_string
VcfFormatIteratorString end_format_string() const
Get the end iterator over the samples that accesses a certain FORMAT id as a string value.
Definition: vcf_record.cpp:539
genesis::population::VcfRecord::get_alternative
std::string get_alternative(size_t index) const
Get a particular alternative allele (ALT, fifth column of the line).
Definition: vcf_record.cpp:217
genesis::population::VcfRecord::get_alternatives
std::vector< std::string > get_alternatives() const
Get the alternative alleles/sequences of the variant (ALT, fifth column of the line).
Definition: vcf_record.cpp:205
genesis::population::VcfRecord::end_format_int
VcfFormatIteratorInt end_format_int() const
Get the end iterator over the samples that accesses a certain FORMAT id as an int value.
Definition: vcf_record.cpp:558
vcf_header.hpp
genesis::population::VcfRecord::pass_filter
bool pass_filter() const
Return whether the record passes the filters, that is, whether PASS is set.
Definition: vcf_record.cpp:338
genesis::population::VcfRecord::get_variants
std::vector< std::string > get_variants() const
Shortcut to get both the reference (REF, fourth column of the line) and the alternative (ALT,...
Definition: vcf_record.cpp:241
genesis::population::VcfRecord::data
::bcf1_t * data()
Return the internal htslib bcf1_t record data struct pointer.
Definition: vcf_record.hpp:194
genesis::population::VcfFormatIterator
Iterate the FORMAT information for the samples in a SNP/variant line in a VCF/BCF file.
Definition: vcf_format_iterator.hpp:62
genesis::population::VcfRecord::swap
void swap(VcfRecord &other)
Definition: vcf_record.cpp:149
genesis::population::VcfRecord::get_info_ids
std::vector< std::string > get_info_ids() const
Get the list of all info IDs (INFO column) that the record contains.
Definition: vcf_record.cpp:351
genesis::population::VcfRecord::read_next
bool read_next(HtsFile &source)
Read the next record/line from the given source, and replace the content of this VcfRecord instance.
Definition: vcf_record.cpp:595
genesis::population::VcfRecord::VariantType::kOther
@ kOther
vcf_common.hpp
range.hpp
genesis::population::VcfRecord::begin_format_string
VcfFormatIteratorString begin_format_string(std::string const &id) const
Get the begin iterator over the samples that accesses a certain FORMAT id as a string value.
Definition: vcf_record.cpp:534
genesis::population::VcfRecord::get_variant_types
VariantType get_variant_types() const
Get the or'ed (union) value of all variant types of the alternative alleles/sequences of the record.
Definition: vcf_record.cpp:275
genesis::population::VcfRecord::VcfRecord
VcfRecord()
Create a default (empty) instance.
Definition: vcf_record.cpp:97
genesis::population::VcfRecord::get_chromosome
std::string get_chromosome() const
Get the name of a chromosome/contig/sequence (CHROM, first column of the line).
Definition: vcf_record.cpp:170
vcf_format_iterator.hpp
genesis::utils::Range
Simple wrapper for typical begin() and end() iterators, to be used in range-based for loops.
Definition: range.hpp:46
genesis::population::VcfRecord::get_format_genotype
genesis::utils::Range< VcfFormatIteratorGenotype > get_format_genotype() const
Get an iterator pair over the samples that accesses the FORMAT genotype (GT field/key/id) as a set of...
Definition: vcf_record.cpp:527
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VcfRecord::get_reference
std::string get_reference() const
Get the reference allele/sequence of the variant (REF, fourth column of the line).
Definition: vcf_record.cpp:195
genesis::population::VcfRecord::has_format
bool has_format(std::string const &id) const
Return whether the record has a given FORMAT id present.
Definition: vcf_record.cpp:487
genesis::population::VcfRecord::get_format_string
genesis::utils::Range< VcfFormatIteratorString > get_format_string(std::string const &id) const
Get an iterator pair over the samples that accesses a certain FORMAT id as a string value.
Definition: vcf_record.cpp:544
genesis::population::VcfRecord::VariantType::kMnp
@ kMnp
genesis::population::VcfRecord::has_info
bool has_info(std::string const &id) const
Return whether the record has a given INFO id present.
Definition: vcf_record.cpp:361
genesis::population::VcfRecord::get_format_float
genesis::utils::Range< VcfFormatIteratorFloat > get_format_float(std::string const &id) const
Get an iterator pair over the samples that accesses a certain FORMAT id as a float value.
Definition: vcf_record.cpp:582
genesis::population::VcfRecord::get_position
size_t get_position() const
Get the position within the chromosome/contig (POS, second column of the line).
Definition: vcf_record.cpp:175
genesis::population::VcfRecord::end_format_genotype
VcfFormatIteratorGenotype end_format_genotype() const
Get the end iterator over the samples that accesses the FORMAT genotype (GT field/key/id) as a set of...
Definition: vcf_record.cpp:522
genesis::population::VcfRecord::get_variant_count
size_t get_variant_count() const
Get the total number of variants (REF and ALT alleles) in the record/line.
Definition: vcf_record.cpp:268
genesis::population::VcfRecord::get_filter_ids
std::vector< std::string > get_filter_ids() const
Get the list of all filter values (PASS or the names of the non-passing filters) that are applied to ...
Definition: vcf_record.cpp:311
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::population::VcfRecord::get_info_int
std::vector< int32_t > get_info_int(std::string const &id) const
Return the info value for the given key id as a vector of int.
Definition: vcf_record.cpp:443
genesis::population::VcfRecord::get_alternatives_count
size_t get_alternatives_count() const
Get the number of alternative alleles/sequences of the variant (ALT, fifth column of the line).
Definition: vcf_record.cpp:232
genesis::population::VcfRecord::end_format_float
VcfFormatIteratorFloat end_format_float() const
Get the end iterator over the samples that accesses a certain FORMAT id as a float value.
Definition: vcf_record.cpp:577
genesis::population::VcfRecord::VariantType
VariantType
Types of variants of alleles that can occur in a record.
Definition: vcf_record.hpp:121
genesis::population::VcfRecord::unpack
void unpack() const
Unpack the htslib bcf1_t record data.
Definition: vcf_record.cpp:165
genesis::population::VcfRecord::get_format_int
genesis::utils::Range< VcfFormatIteratorInt > get_format_int(std::string const &id) const
Get an iterator pair over the samples that accesses a certain FORMAT id as an int value.
Definition: vcf_record.cpp:563
genesis::population::VcfRecord::data
::bcf1_t const * data() const
Return the internal htslib bcf1_t record data struct pointer.
Definition: vcf_record.hpp:202
genesis::population::VcfRecord::get_info_float
std::vector< double > get_info_float(std::string const &id) const
Return the info value for the given key id as a vector of float/double.
Definition: vcf_record.cpp:419
genesis::population::VcfRecord::VariantType::kIndel
@ kIndel
genesis::population::VcfRecord::get_variant
std::string get_variant(size_t index) const
Get a particular variant (REF or ALT allele).
Definition: vcf_record.cpp:253
genesis::population::VcfHeader
Capture the information from a header of a VCF/BCF file.
Definition: vcf_header.hpp:102
genesis::population::HtsFile
Wrap an ::htsFile struct.
Definition: hts_file.hpp:56
genesis::population::VcfRecord::VariantType::kRef
@ kRef
genesis::population::VcfRecord::VariantType::kOverlap
@ kOverlap
genesis::population::VcfRecord::get_quality
double get_quality() const
Get the quality score (QUAL, sixth column of the line).
Definition: vcf_record.cpp:302
genesis::population::VcfRecord::~VcfRecord
~VcfRecord()
Definition: vcf_record.cpp:123
genesis::population::VcfRecord::has_filter
bool has_filter(std::string const &filter) const
Return whether the record has a given filter set.
Definition: vcf_record.cpp:321