A library for working with phylogenetic and population genetic data.
v0.27.0
vcf_header.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_VCF_HEADER_H_
2 #define GENESIS_POPULATION_FORMATS_VCF_HEADER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2021 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
38 
39 #include <string>
40 #include <unordered_map>
41 #include <vector>
42 
43 extern "C" {
44  // #include <htslib/hts.h>
45  // #include <htslib/vcf.h>
46 
47  struct bcf_hdr_t;
48 }
49 
50 namespace genesis {
51 namespace population {
52 
53 // =================================================================================================
54 // Forward Declarations
55 // =================================================================================================
56 
57 class VcfRecord;
58 
59 template< typename S, typename T >
60 class VcfFormatIterator;
61 
62 // =================================================================================================
63 // VCF/BCF Header
64 // =================================================================================================
65 
103 {
104 public:
105 
106  // -------------------------------------------------------------------------
107  // Typedefs and Enums
108  // -------------------------------------------------------------------------
109 
110  // VcfRecord and VcfFormatIterator both need access to some htslib functions/definitions
111  // for which we'd have to include htslib headers in our headers, which we want to avoid.
112  // So instead, we wrap htslib internal functions here.
113  // We could make those free functions, but it seems too specialized for general usage,
114  // so let's keep it in private scope for now, and make the classes friends instead.
115 
116  friend class VcfRecord;
117 
118  template< typename S, typename T >
119  friend class VcfFormatIterator;
120 
121  // -------------------------------------------------------------------------
122  // Constructors and Rule of Five
123  // -------------------------------------------------------------------------
124 
// Create a default (empty) instance.
128  VcfHeader() = default;
129 
130  // /* *
131  // * @brief Create an instance that reads from an input file name.
132  // *
133  // * The header is read from the file, which is subsequently closed again. Hence, this
134  // * cannot be used to further operate on that file. It is merely meant for cases where
135  // * quick access to the VCF header information is needed, without parsing its body.
136  // */
137  // explicit VcfHeader( std::string const& file_name );
138 
// Construct from a mode string.
// NOTE(review): the meaning and valid values of `mode` are not visible in this listing;
// confirm against vcf_header.cpp.
144  explicit VcfHeader( std::string const& mode );
145 
// Construct from an HtsFile, the genesis class that wraps an ::htsFile struct.
152  explicit VcfHeader( HtsFile& hts_file );
153 
// Construct from an htslib ::bcf_hdr_t pointer.
// NOTE(review): whether the pointer is copied or adopted is not visible here; confirm.
160  explicit VcfHeader( ::bcf_hdr_t* bcf_hdr );
161 
162  ~VcfHeader();
163 
// Copy construction and copy assignment are deleted; move construction and move
// assignment are declared here and defined elsewhere.
164  VcfHeader( VcfHeader const& ) = delete;
165  VcfHeader( VcfHeader&& other );
166 
167  VcfHeader& operator= ( VcfHeader const& ) = delete;
168  VcfHeader& operator= ( VcfHeader&& other );
169 
170  // -------------------------------------------------------------------------
171  // General Accessors
172  // -------------------------------------------------------------------------
173 
177  ::bcf_hdr_t* data() // Return the internal htslib ::bcf_hdr_t data struct pointer (non-const access).
178  {
179  return header_; // The stored pointer is handed out unchanged.
180  }
181 
185  ::bcf_hdr_t const* data() const // Return the internal htslib ::bcf_hdr_t data struct pointer (pointer-to-const, usable on const instances).
186  {
187  return header_; // The stored pointer is handed out unchanged.
188  }
189 
193  std::string version() const; // Return the VCF/BCF version string, e.g. "VCFv4.2".
194 
195  // -------------------------------------------------------------------------
196  // Chromosomes / Contigs / Sequences
197  // -------------------------------------------------------------------------
198 
// Get a list of the chromosome/contig/sequence names used in the file.
204  std::vector<std::string> get_chromosomes() const;
205 
// Get the length of a chromosome/contig/sequence, given its name.
211  size_t get_chromosome_length( std::string const& chrom_name ) const;
212 
// Get all key-value-pairs describing a particular chromosome/contig/sequence, given its name.
223  std::unordered_map<std::string, std::string> get_chromosome_values( std::string const& chrom_name ) const;
224 
225  // -------------------------------------------------------------------------
226  // Filter
227  // -------------------------------------------------------------------------
228 
// Get a list of the ID names of all FILTER entries in the header.
238  std::vector<std::string> get_filter_ids() const;
239 
// Get all key-value pairs describing a particular filter header line, given its ID.
251  std::unordered_map<std::string, std::string> get_filter_values( std::string const& id ) const;
252 
// Assert that a FILTER entry with a given ID is defined in the header of the VCF/BCF file.
256  void assert_filter( std::string const& id ) const;
257 
// Return whether a FILTER entry with a given ID is defined in the header of the VCF/BCF file.
261  bool has_filter( std::string const& id ) const;
262 
263  // -------------------------------------------------------------------------
264  // Info
265  // -------------------------------------------------------------------------
266 
// Get a list of the ID names of all INFO fields in the header.
276  std::vector<std::string> get_info_ids() const;
277 
// Get the required specification key-value-pairs for a given INFO entry.
284  VcfSpecification get_info_specification( std::string const& id ) const;
285 
// Get all key-value pairs describing a particular info header line, given its ID.
300  std::unordered_map<std::string, std::string> get_info_values( std::string const& id ) const;
301 
// Assert that an INFO entry with a given ID is defined in the header of the VCF/BCF file.
305  void assert_info( std::string const& id ) const;
306 
// Overloads of assert_info() that additionally take a value type, and optionally either
// a special number marker or a concrete number of values.
// NOTE(review): presumably these also check the entry against the extra arguments;
// confirm against vcf_header.cpp.
311  void assert_info( std::string const& id, VcfValueType type ) const;
312 
320  void assert_info( std::string const& id, VcfValueType type, VcfValueSpecial special ) const;
321 
330  void assert_info( std::string const& id, VcfValueType type, size_t number ) const;
331 
// Return whether an INFO entry with a given ID is defined in the header of the VCF/BCF file.
335  bool has_info( std::string const& id ) const;
336 
// Overloads of has_info() taking the same extra arguments as the assert_info() overloads.
// NOTE(review): presumably the non-throwing counterparts; confirm against vcf_header.cpp.
341  bool has_info( std::string const& id, VcfValueType type ) const;
342 
350  bool has_info( std::string const& id, VcfValueType type, VcfValueSpecial special ) const;
351 
360  bool has_info( std::string const& id, VcfValueType type, size_t number ) const;
361 
362  // -------------------------------------------------------------------------
363  // Format
364  // -------------------------------------------------------------------------
365 
// Get a list of the ID names of all FORMAT fields in the header.
375  std::vector<std::string> get_format_ids() const;
376 
// Get the required specification key-value-pairs for a given FORMAT entry.
383  VcfSpecification get_format_specification( std::string const& id ) const;
384 
// Get all key-value pairs describing a particular format field, given its ID.
399  std::unordered_map<std::string, std::string> get_format_values( std::string const& id ) const;
400 
// Assert that a FORMAT entry with a given ID is defined in the header of the VCF/BCF file.
404  void assert_format( std::string const& id ) const;
405 
// Overloads of assert_format() that additionally take a value type, and optionally either
// a special number marker or a concrete number of values.
// NOTE(review): presumably these also check the entry against the extra arguments;
// confirm against vcf_header.cpp.
410  void assert_format( std::string const& id, VcfValueType type ) const;
411 
419  void assert_format( std::string const& id, VcfValueType type, VcfValueSpecial special ) const;
420 
429  void assert_format( std::string const& id, VcfValueType type, size_t number ) const;
430 
// Return whether a FORMAT entry with a given ID is defined in the header of the VCF/BCF file.
434  bool has_format( std::string const& id ) const;
435 
// Overloads of has_format() taking the same extra arguments as the assert_format() overloads.
// NOTE(review): presumably the non-throwing counterparts; confirm against vcf_header.cpp.
440  bool has_format( std::string const& id, VcfValueType type ) const;
441 
449  bool has_format( std::string const& id, VcfValueType type, VcfValueSpecial special ) const;
450 
459  bool has_format( std::string const& id, VcfValueType type, size_t number ) const;
460 
461  // -------------------------------------------------------------------------
462  // Samples
463  // -------------------------------------------------------------------------
464 
// Get the number of samples (columns) in the file.
468  size_t get_sample_count() const;
469 
// Get the name of a sample given its index.
476  std::string get_sample_name( size_t index ) const;
477 
// Get the index of a sample, given its name.
483  size_t get_sample_index( std::string const& name ) const;
484 
// Return a list of the sample names (column headers) of the VCF/BCF file.
491  std::vector<std::string> get_sample_names() const;
492 
// Specify a subset of samples to be parsed. `inverse_sample_names` defaults to false.
// NOTE(review): presumably `inverse_sample_names = true` selects all samples except the
// listed ones; confirm against vcf_header.cpp.
503  void set_samples(
504  std::vector<std::string> const& sample_names,
505  bool inverse_sample_names = false
506  );
507 
508  // -------------------------------------------------------------------------
509  // Internal Helpers
510  // -------------------------------------------------------------------------
511 
512 private:
513 
// Private helpers. Per the note at the top of the class, htslib functionality is wrapped
// in this class so that other genesis headers need not include htslib headers.
// NOTE(review): the descriptions below are inferred from names and signatures only;
// confirm each against vcf_header.cpp.

// Presumably lists the IDs of all header records of the given header line type `hl_type`.
518  std::vector<std::string> get_hrec_ids_( int hl_type ) const;
519 
// Presumably returns the key-value pairs of the header record of type `hl_type` with ID `id`.
524  std::unordered_map<std::string, std::string> get_hrec_values_(
525  int hl_type, std::string const& id
526  ) const;
527 
// Presumably the shared backend of get_info_specification() and get_format_specification().
532  VcfSpecification get_specification_( int hl_type, std::string const& id) const;
533 
// Presumably the shared backend of the assert_*() and has_*() functions: `throwing`
// would select between the two, and each `with_*` flag would say whether the value
// that follows it takes part in the check.
547  bool test_hl_entry_(
548  bool throwing,
549  int hl_type, std::string const& id,
550  bool with_type, VcfValueType type,
551  bool with_special, VcfValueSpecial special,
552  bool with_number, size_t number
553  ) const;
554 
// Static, so it uses no instance state: it takes the header explicitly.
// Presumably inspects `return_value` from an htslib call for entry `id` and reports errors.
574  static void check_value_return_code_(
575  ::bcf_hdr_t* header, std::string const& id, int ht_type, int hl_type, int return_value
576  );
577 
578  // -------------------------------------------------------------------------
579  // Data Members
580  // -------------------------------------------------------------------------
581 
582 private:
583 
584  ::bcf_hdr_t* header_ = nullptr;
585 };
586 
587 } // namespace population
588 } // namespace genesis
589 
590 #endif // htslib guard
591 #endif // include guard
genesis::population::VcfHeader::get_format_ids
std::vector< std::string > get_format_ids() const
Get a list of the ID names of all FORMAT fields in the header.
Definition: vcf_header.cpp:285
genesis::population::VcfHeader::get_format_values
std::unordered_map< std::string, std::string > get_format_values(std::string const &id) const
Get all key-value pairs describing a particular format field, given its ID.
Definition: vcf_header.cpp:295
genesis::population::VcfHeader::get_format_specification
VcfSpecification get_format_specification(std::string const &id) const
Get the required specification key-value-pairs for a given FORMAT entry.
Definition: vcf_header.cpp:290
genesis::population::VcfHeader::get_chromosome_length
size_t get_chromosome_length(std::string const &chrom_name) const
Get the length of a chromosome/contig/sequence, given its name.
Definition: vcf_header.cpp:174
genesis::population::VcfHeader::get_info_ids
std::vector< std::string > get_info_ids() const
Get a list of the ID names of all INFO fields in the header.
Definition: vcf_header.cpp:226
genesis::population::VcfValueSpecial
VcfValueSpecial
Specification for special markers for the number of values expected for key-value-pairs of VCF/BCF fi...
Definition: vcf_common.hpp:105
genesis::population::VcfSpecification
Collect the four required keys that describe an INFO or FORMAT sub-field of VCF/BCF files.
Definition: vcf_common.hpp:150
genesis::population::VcfHeader::get_chromosomes
std::vector< std::string > get_chromosomes() const
Get a list of the chromosome/contig/sequence names used in the file.
Definition: vcf_header.cpp:140
genesis::population::VcfHeader::assert_format
void assert_format(std::string const &id) const
Assert that a FORMAT entry with a given ID is defined in the header of the VCF/BCF file.
Definition: vcf_header.cpp:300
genesis::population::VcfHeader::get_info_values
std::unordered_map< std::string, std::string > get_info_values(std::string const &id) const
Get all key-value pairs describing a particular info header line, given its ID.
Definition: vcf_header.cpp:236
genesis::population::VcfHeader::get_info_specification
VcfSpecification get_info_specification(std::string const &id) const
Get the required specification key-value-pairs for a given INFO entry.
Definition: vcf_header.cpp:231
genesis::population::VcfHeader::get_sample_index
size_t get_sample_index(std::string const &name) const
Get the index of a sample, given its name.
Definition: vcf_header.cpp:361
genesis::population::VcfHeader::get_chromosome_values
std::unordered_map< std::string, std::string > get_chromosome_values(std::string const &chrom_name) const
Get all key-value-pairs describing a particular chromosome/contig/sequence, given its name.
Definition: vcf_header.cpp:193
genesis::population::VcfHeader::version
std::string version() const
Return the VCF/BCF version string, e.g. "VCFv4.2".
Definition: vcf_header.cpp:131
genesis::population::VcfHeader::get_filter_values
std::unordered_map< std::string, std::string > get_filter_values(std::string const &id) const
Get all key-value pairs describing a particular filter header line, given its ID.
Definition: vcf_header.cpp:207
genesis::population::VcfHeader::get_sample_names
std::vector< std::string > get_sample_names() const
Return a list of the sample names (column headers) of the VCF/BCF file.
Definition: vcf_header.cpp:373
genesis::population::VcfFormatIterator
Iterate the FORMAT information for the samples in a SNP/variant line in a VCF/BCF file.
Definition: vcf_format_iterator.hpp:62
vcf_common.hpp
genesis::population::VcfHeader::has_info
bool has_info(std::string const &id) const
Return whether an INFO entry with a given ID is defined in the header of the VCF/BCF file.
Definition: vcf_header.cpp:261
genesis::population::VcfHeader::get_sample_count
size_t get_sample_count() const
Get the number of samples (columns) in the file.
Definition: vcf_header.cpp:344
hts_file.hpp
genesis::population::VcfHeader::~VcfHeader
~VcfHeader()
Definition: vcf_header.cpp:100
genesis::population::VcfHeader::VcfHeader
VcfHeader()=default
Create a default (empty) instance.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VcfHeader::operator=
VcfHeader & operator=(VcfHeader const &)=delete
genesis::population::VcfHeader::data
::bcf_hdr_t * data()
Return the internal htslib ::bcf_hdr_t data struct pointer.
Definition: vcf_header.hpp:177
genesis::population::VcfHeader::assert_filter
void assert_filter(std::string const &id) const
Assert that a FILTER entry with a given ID is defined in the header of the VCF/BCF file.
Definition: vcf_header.cpp:212
genesis::population::VcfHeader::has_format
bool has_format(std::string const &id) const
Return whether a FORMAT entry with a given ID is defined in the header of the VCF/BCF file.
Definition: vcf_header.cpp:320
genesis::population::VcfValueType
VcfValueType
Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:88
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::population::VcfHeader::assert_info
void assert_info(std::string const &id) const
Assert that an INFO entry with a given ID is defined in the header of the VCF/BCF file.
Definition: vcf_header.cpp:241
genesis::population::VcfHeader::has_filter
bool has_filter(std::string const &id) const
Return whether a FILTER entry with a given ID is defined in the header of the VCF/BCF file.
Definition: vcf_header.cpp:217
genesis::population::VcfHeader::data
::bcf_hdr_t const * data() const
Return the internal htslib ::bcf_hdr_t data struct pointer.
Definition: vcf_header.hpp:185
genesis::population::VcfHeader::get_filter_ids
std::vector< std::string > get_filter_ids() const
Get a list of the ID names of all FILTER entries in the header.
Definition: vcf_header.cpp:202
genesis::population::VcfHeader
Capture the information from a header of a VCF/BCF file.
Definition: vcf_header.hpp:102
genesis::population::HtsFile
Wrap an ::htsFile struct.
Definition: hts_file.hpp:56
genesis::population::VcfHeader::set_samples
void set_samples(std::vector< std::string > const &sample_names, bool inverse_sample_names=false)
Specify a subset of samples to be parsed.
Definition: vcf_header.cpp:384
genesis::population::VcfHeader::get_sample_name
std::string get_sample_name(size_t index) const
Get the name of a sample given its index.
Definition: vcf_header.cpp:350