A library for working with phylogenetic and population genetic data.
v0.32.0
vcf_common.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMAT_VCF_COMMON_H_
2 #define GENESIS_POPULATION_FORMAT_VCF_COMMON_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
36 #include <cstdint>
37 #include <string>
38 #include <vector>
39 
43 
44 extern "C" {
45  // #include <htslib/vcf.h>
46  // #include <htslib/hts.h>
47 }
48 
49 namespace genesis {
50 namespace population {
51 
52 // =================================================================================================
53 // Forward Declarations
54 // =================================================================================================
55 
/**
 * @brief A single variant at a position in a chromosome, along with SampleCounts
 * for a set of samples.
 *
 * Only forward declared here; the definition lives in variant.hpp.
 */
struct Variant;

/**
 * @brief Capture the information of a single SNP/variant line in a VCF/BCF file.
 *
 * Only forward declared here; the definition lives in vcf_record.hpp.
 */
class VcfRecord;
58 
59 // =================================================================================================
60 // Typedefs and Enums
61 // =================================================================================================
62 
/**
 * @brief Specification for the values determining header line types of VCF/BCF files.
 *
 * NOTE(review): the numeric values presumably mirror the htslib-internal `BCF_HL_*`
 * constants (cf. vcf_hl_type_to_string()), in which case they must not be renumbered.
 * Confirm against htslib before changing any of them.
 */
enum class VcfHeaderLine : int
{
    kFilter     = 0,
    kInfo       = 1,
    kFormat     = 2,
    kContig     = 3,
    kStructured = 4, ///< Structured header line, of the form `TAG=<A=..,B=..>`.
    kGeneric    = 5  ///< Generic header line.
};
80 
/**
 * @brief Specification for the data type of the values expected in key-value-pairs
 * of VCF/BCF files.
 *
 * NOTE(review): the numeric values presumably mirror the htslib `BCF_HT_*` constants;
 * confirm against htslib before changing any of them.
 */
enum class VcfValueType : int
{
    kFlag    = 0,
    kInteger = 1,
    kFloat   = 2,
    kString  = 3
};
96 
/**
 * @brief Specification for special markers for the number of values expected for
 * key-value-pairs of VCF/BCF files.
 *
 * NOTE(review): the numeric values presumably mirror the htslib `BCF_VL_*` constants;
 * confirm against htslib before changing any of them.
 */
enum class VcfValueSpecial : int
{
    /**
     * @brief Fixed number of values expected.
     *
     * In VCF, this is denoted simply by an integer number.
     */
    kFixed = 0,

    /**
     * @brief Variable number of possible values, or unknown, or unbounded.
     *
     * In VCF, this is denoted by '.'.
     */
    kVariable = 1,

    /**
     * @brief Number of values tied to the alleles.
     *
     * NOTE(review): presumably the VCF `Number=A` marker (one value per alternate
     * allele) - confirm against the VCF specification.
     */
    kAllele = 2,

    /**
     * @brief Number of values tied to the genotypes.
     *
     * NOTE(review): presumably the VCF `Number=G` marker (one value per possible
     * genotype) - confirm against the VCF specification.
     */
    kGenotype = 3,

    /**
     * @brief Number of values tied to the alleles, including the reference.
     *
     * NOTE(review): presumably the VCF `Number=R` marker (one value per allele,
     * including the reference) - confirm against the VCF specification.
     */
    kReference = 4,
};
142 
152 {
153  std::string id;
156  int number;
157  std::string description;
158 };
159 
160 // =================================================================================================
161 // Typedef and Enum Helpers
162 // =================================================================================================
163 
164 std::string vcf_value_type_to_string( VcfValueType ht_type );
165 std::string vcf_value_type_to_string( int ht_type );
166 std::string vcf_value_special_to_string( VcfValueSpecial vl_type_num );
167 std::string vcf_value_special_to_string( int vl_type_num );
168 
/**
 * @brief Internal helper function to convert htslib-internal `BCF_HL_*` header line
 * type values to a string.
 *
 * @param hl_type Raw header line type value, as a plain `int`.
 */
std::string vcf_hl_type_to_string( int hl_type );
174 
175 // =================================================================================================
176 // Conversion Functions
177 // =================================================================================================
178 
199  VcfRecord const& record
200 );
201 
227  VcfRecord const& record,
228  bool use_allelic_depth = false
229 );
230 
243 GenomeLocusSet genome_locus_set_from_vcf_file( std::string const& file );
244 
263 GenomeRegionList genome_region_list_from_vcf_file( std::string const& file );
264 
272 void genome_region_list_from_vcf_file( std::string const& file, GenomeRegionList& target );
273 
274 // =================================================================================================
275 // VCF Genotype Functions
276 // =================================================================================================
277 
// Forward declare, so that the functions below can take a vector of genotypes
// before the class itself is defined further down in this header.
class VcfGenotype;

/**
 * @brief Return the VCF-like string representation of a set of VcfGenotype entries.
 */
std::string vcf_genotype_string( std::vector<VcfGenotype> const& genotypes );

/**
 * @brief Return the sum of genotypes for a set of VcfGenotype entries.
 */
size_t vcf_genotype_sum( std::vector<VcfGenotype> const& genotypes );
300 
301 // =================================================================================================
302 // VCF Genotype
303 // =================================================================================================
304 
/**
 * @brief Simple wrapper class for one genotype field for a sample.
 *
 * The class stores the raw genotype value as used by htslib, and declares accessors to
 * interpret it. Apart from the constructor, the member functions are only declared here;
 * they are implemented in vcf_common.cpp.
 */
class VcfGenotype
{
public:

    // -------------------------------------------------------------------------
    //     Constructors and Rule of Five
    // -------------------------------------------------------------------------

    /**
     * @brief Wrap the given raw @p genotype value.
     *
     * The constructor is explicit, so that plain integers are not implicitly
     * converted into genotypes.
     */
    explicit VcfGenotype( int32_t genotype )
        : genotype_(genotype)
    {}

    ~VcfGenotype() = default;

    VcfGenotype( VcfGenotype const& ) = default;
    VcfGenotype( VcfGenotype&& )      = default;

    VcfGenotype& operator= ( VcfGenotype const& ) = default;
    VcfGenotype& operator= ( VcfGenotype&& )      = default;

    // -------------------------------------------------------------------------
    //     Access Functions
    // -------------------------------------------------------------------------

    /**
     * @brief Return the index of the variant set for this genotype call.
     */
    int32_t variant_index() const;

    /**
     * @brief True iff the called variant of this genotype is the REF allele.
     */
    bool is_reference() const;

    /**
     * @brief True iff the called variant of this genotype is not the REF,
     * but one of the ALT alleles.
     */
    bool is_alternative() const;

    /**
     * @brief True iff the variant call is missing for this genotype.
     */
    bool is_missing() const;

    /**
     * @brief True iff the called variant is phased.
     */
    bool is_phased() const;

    /**
     * @brief Return the raw genotype value as used by htslib.
     */
    int32_t data() const;

    // -------------------------------------------------------------------------
    //     Data Members
    // -------------------------------------------------------------------------

private:

    // Raw genotype value, stored exactly as given to the constructor.
    int32_t genotype_;
};
407 
408 } // namespace population
409 } // namespace genesis
410 
411 #endif // htslib guard
412 #endif // include guard
genesis::population::VcfGenotype::is_reference
bool is_reference() const
True iff the called variant of this genotype is the REF allele.
Definition: vcf_common.cpp:707
genesis::population::VcfGenotype
Simple wrapper class for one genotype field for a sample.
Definition: vcf_common.hpp:328
genesis::population::VcfGenotype::is_missing
bool is_missing() const
True iff the variant call is missing for this genotype.
Definition: vcf_common.cpp:717
genesis::population::VcfGenotype::variant_index
int32_t variant_index() const
Return the index of the variant set for this genotype call.
Definition: vcf_common.cpp:702
genesis::population::VcfSpecification::description
std::string description
Definition: vcf_common.hpp:157
genesis::population::VcfGenotype::is_phased
bool is_phased() const
True iff the called variant is phased.
Definition: vcf_common.cpp:722
genesis::population::VcfValueSpecial::kAllele
@ kAllele
genesis::population::VcfValueSpecial::kGenotype
@ kGenotype
genesis::population::vcf_hl_type_to_string
std::string vcf_hl_type_to_string(int hl_type)
Internal helper function to convert htslib-internal BCF_HL_* header line type values to their string ...
Definition: vcf_common.cpp:208
genesis::population::VcfValueType::kString
@ kString
genesis::population::VcfGenotype::data
int32_t data() const
Return the raw genotype value as used by htslib.
Definition: vcf_common.cpp:727
genesis::population::genome_region_list_from_vcf_file
GenomeRegionList genome_region_list_from_vcf_file(std::string const &file)
Read a VCF file, and use its positions to create a GenomeRegionList.
Definition: vcf_common.cpp:600
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::population::VcfHeaderLine::kFilter
@ kFilter
genesis::population::VcfHeaderLine
VcfHeaderLine
Specification for the values determining header line types of VCF/BCF files.
Definition: vcf_common.hpp:71
genesis::population::VcfSpecification::id
std::string id
Definition: vcf_common.hpp:153
genesis::population::VcfValueSpecial
VcfValueSpecial
Specification for special markers for the number of values expected for key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:106
genesis::population::VcfValueSpecial::kVariable
@ kVariable
Variable number of possible values, or unknown, or unbounded. In VCF, this is denoted by '.'.
genesis::population::VcfValueType::kFloat
@ kFloat
genesis::population::VcfSpecification
Collect the four required keys that describe an INFO or FORMAT sub-field of VCF/BCF files.
Definition: vcf_common.hpp:151
genesis::population::VcfHeaderLine::kGeneric
@ kGeneric
genesis::population::VcfGenotype::VcfGenotype
VcfGenotype(int32_t genotype)
Definition: vcf_common.hpp:336
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::VcfHeaderLine::kContig
@ kContig
genesis::population::VcfSpecification::special
VcfValueSpecial special
Definition: vcf_common.hpp:155
genesis::population::convert_to_variant_as_individuals
Variant convert_to_variant_as_individuals(VcfRecord const &record, bool use_allelic_depth)
Convert a VcfRecord to a Variant, treating each sample as an individual, and combining them all into ...
Definition: vcf_common.cpp:453
genesis::population::genome_locus_set_from_vcf_file
GenomeLocusSet genome_locus_set_from_vcf_file(std::string const &file)
Read a VCF file, and use its positions to create a GenomeLocusSet.
Definition: vcf_common.cpp:580
genesis::population::VcfValueSpecial::kReference
@ kReference
genome_region.hpp
genesis::population::VcfHeaderLine::kStructured
@ kStructured
genesis::population::VcfGenotype::~VcfGenotype
~VcfGenotype()=default
genome_region_list.hpp
genesis::population::vcf_genotype_sum
size_t vcf_genotype_sum(std::vector< VcfGenotype > const &genotypes)
Return the sum of genotypes for a set of VcfGenotype entries, typically used to construct a genotype ...
Definition: vcf_common.cpp:688
genesis::population::VcfValueType::kInteger
@ kInteger
genesis::population::VcfValueSpecial::kFixed
@ kFixed
Fixed number of values expected. In VCF, this is denoted simply by an integer number.
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VcfGenotype::operator=
VcfGenotype & operator=(VcfGenotype const &)=default
genesis::population::vcf_value_special_to_string
std::string vcf_value_special_to_string(VcfValueSpecial vl_type_num)
Definition: vcf_common.cpp:175
genome_locus_set.hpp
genesis::population::VcfSpecification::type
VcfValueType type
Definition: vcf_common.hpp:154
genesis::population::VcfHeaderLine::kFormat
@ kFormat
genesis::population::vcf_genotype_string
std::string vcf_genotype_string(std::vector< VcfGenotype > const &genotypes)
Return the VCF-like string representation of a set of VcfGenotype entries.
Definition: vcf_common.cpp:674
genesis::population::VcfGenotype::is_alternative
bool is_alternative() const
True iff the called variant of this genotype is not the REF, but one of the ALT alleles.
Definition: vcf_common.cpp:712
genesis::population::VcfValueType
VcfValueType
Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:89
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::population::VcfHeaderLine::kInfo
@ kInfo
genesis::population::vcf_value_type_to_string
std::string vcf_value_type_to_string(VcfValueType ht_type)
Definition: vcf_common.cpp:145
genesis::population::convert_to_variant_as_pool
Variant convert_to_variant_as_pool(VcfRecord const &record)
Convert a VcfRecord to a Variant, treating each sample column as a pool of individuals.
Definition: vcf_common.cpp:393
genesis::population::VcfSpecification::number
int number
Definition: vcf_common.hpp:156
genesis::population::VcfValueType::kFlag
@ kFlag