A library for working with phylogenetic and population genetic data.
v0.27.0
vcf_common.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_VCF_COMMON_H_
2 #define GENESIS_POPULATION_FORMATS_VCF_COMMON_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #ifdef GENESIS_HTSLIB
35 
36 #include <cstdint>
37 #include <string>
38 #include <vector>
39 
42 
43 extern "C" {
44  // #include <htslib/vcf.h>
45  // #include <htslib/hts.h>
46 }
47 
48 namespace genesis {
49 namespace population {
50 
51 // =================================================================================================
52 // Forward Declarations
53 // =================================================================================================
54 
// Only forward declarations are needed: within this header, both classes are used purely in
// function declarations (as a return type, or as a const reference parameter).

/**
 * @brief A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
 */
class Variant;

/**
 * @brief Capture the information of a single SNP/variant line in a VCF/BCF file.
 */
class VcfRecord;
57 
58 // =================================================================================================
59 // Typedefs and Enums
60 // =================================================================================================
61 
/**
 * @brief Specification for the values determining header line types of VCF/BCF files.
 *
 * The numeric values are fixed explicitly, and the underlying type is `int`.
 * NOTE(review): the values presumably mirror the `BCF_HL_*` constants of htslib
 * (cf. vcf_hl_type_to_string(), which takes such a constant as plain `int`) — confirm against
 * `htslib/vcf.h` before changing any of them.
 */
enum class VcfHeaderLine : int
{
    kFilter     = 0,
    kInfo       = 1,
    kFormat     = 2,
    kContig     = 3,
    kStructured = 4, ///< Structured header line of the form `TAG=<A=..,B=..>`.
    kGeneric    = 5  ///< Generic header line.
};
79 
/**
 * @brief Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
 *
 * The numeric values are fixed explicitly, and the underlying type is `int`.
 * NOTE(review): the values presumably mirror the `BCF_HT_*` constants of htslib — confirm
 * against `htslib/vcf.h` before changing any of them.
 */
enum class VcfValueType : int
{
    kFlag    = 0,
    kInteger = 1,
    kFloat   = 2,
    kString  = 3
};
95 
/**
 * @brief Specification for special markers for the number of values expected for
 * key-value-pairs of VCF/BCF files.
 *
 * The numeric values are fixed explicitly, and the underlying type is `int`.
 * NOTE(review): the values presumably mirror the `BCF_VL_*` constants of htslib — confirm
 * against `htslib/vcf.h` before changing any of them.
 */
enum class VcfValueSpecial : int
{
    /**
     * @brief Fixed number of values expected. In VCF, this is denoted simply by an integer number.
     */
    kFixed = 0,

    /**
     * @brief Variable number of possible values, or unknown, or unbounded.
     * In VCF, this is denoted by '.'.
     */
    kVariable = 1,

    /**
     * NOTE(review): presumably one value per alternate allele (VCF `Number=A`) — confirm
     * against the VCF specification.
     */
    kAllele = 2,

    /**
     * NOTE(review): presumably one value per possible genotype (VCF `Number=G`) — confirm
     * against the VCF specification.
     */
    kGenotype = 3,

    /**
     * NOTE(review): presumably one value per allele, including the reference (VCF `Number=R`)
     * — confirm against the VCF specification.
     */
    kReference = 4,
};
141 
// Opaque redeclarations of the two scoped enums defined above (same underlying type `int`),
// so that this struct only depends on their names, not on where their definitions are placed.
enum class VcfValueType : int;
enum class VcfValueSpecial : int;

/**
 * @brief Collect the four required keys that describe an INFO or FORMAT sub-field
 * of VCF/BCF files.
 *
 * This is a plain aggregate without invariants; members are declared in the order
 * `id`, `type`, `special`, `number`, `description`, which is also the order to use for
 * aggregate initialization. No member has a default initializer, so `number`, `type` and
 * `special` are indeterminate unless the object is value-initialized or assigned.
 */
struct VcfSpecification
{
    /// Identifier of the sub-field. NOTE(review): presumably the `ID` key of the header line.
    std::string id;

    /// Data type of the values of the sub-field.
    VcfValueType type;

    /// Marker for how the number of values of the sub-field is specified.
    VcfValueSpecial special;

    /// Number of values. NOTE(review): presumably only meaningful if `special` is
    /// VcfValueSpecial::kFixed — confirm against the code that fills this struct.
    int number;

    /// Description text of the sub-field.
    std::string description;
};
158 
159 // =================================================================================================
160 // Typedef and Enum Helpers
161 // =================================================================================================
162 
163 std::string vcf_value_type_to_string( VcfValueType ht_type );
164 std::string vcf_value_type_to_string( int ht_type );
165 std::string vcf_value_special_to_string( VcfValueSpecial vl_type_num );
166 std::string vcf_value_special_to_string( int vl_type_num );
167 
/**
 * @brief Internal helper function to convert htslib-internal `BCF_HL_*` header line type
 * values to their string representation.
 *
 * @param hl_type Header line type, given as the plain `int` value used by htslib.
 * @return String for the given header line type.
 */
std::string vcf_hl_type_to_string( int hl_type );
173 
174 // =================================================================================================
175 // Conversion Functions
176 // =================================================================================================
177 
// Both types are only needed as incomplete types for the declarations below. They are
// redeclared here so that these declarations do not depend on declaration order in the header.
class Variant;
class VcfRecord;

/**
 * @brief Convert a VcfRecord to a Variant, treating each sample column as a pool of individuals.
 *
 * @param record Record (one line of the VCF/BCF file) to convert.
 * @return The resulting Variant.
 */
Variant convert_to_variant_as_pool(
    VcfRecord const& record
);

/**
 * @brief Convert a VcfRecord to a Variant, treating each sample as an individual,
 * and combining them all.
 *
 * @param record            Record (one line of the VCF/BCF file) to convert.
 * @param use_allelic_depth Defaults to `false`. NOTE(review): presumably switches the counts
 *                          from being derived from the called genotypes to being taken from
 *                          the allelic depth of the record — confirm in vcf_common.cpp.
 * @return The resulting Variant.
 */
Variant convert_to_variant_as_individuals(
    VcfRecord const& record,
    bool use_allelic_depth = false
);
229 
243 GenomeRegionList genome_region_list_from_vcf_file( std::string const& file );
244 
252 void genome_region_list_from_vcf_file( std::string const& file, GenomeRegionList& target );
253 
254 // =================================================================================================
255 // VCF Genotype Functions
256 // =================================================================================================
257 
// Forward declaration, as the functions below only need the type name;
// the class itself is defined further down in this header.
class VcfGenotype;

/**
 * @brief Return the VCF-like string representation of a set of VcfGenotype entries.
 *
 * @param genotypes Genotype entries to represent.
 */
std::string vcf_genotype_string( std::vector<VcfGenotype> const& genotypes );

/**
 * @brief Return the sum of genotypes for a set of VcfGenotype entries.
 *
 * @param genotypes Genotype entries to sum up.
 */
size_t vcf_genotype_sum( std::vector<VcfGenotype> const& genotypes );
280 
281 // =================================================================================================
282 // VCF Genotype
283 // =================================================================================================
284 
/**
 * @brief Simple wrapper class for one genotype field for a sample.
 *
 * The class stores a single `int32_t`, which is the raw genotype value as used by htslib,
 * and offers accessors to interpret it. All accessors are `const`; after construction,
 * the stored value can only be changed by assigning another VcfGenotype.
 */
class VcfGenotype
{
public:

    // -------------------------------------------------------------------------
    //     Constructors and Rule of Five
    // -------------------------------------------------------------------------

    /**
     * @brief Construct from a raw genotype value, which is stored as given.
     *
     * The constructor is `explicit`, so that plain integers do not silently convert
     * to a genotype.
     */
    explicit VcfGenotype( int32_t genotype )
        : genotype_( genotype )
    {}

    ~VcfGenotype() = default;

    VcfGenotype( VcfGenotype const& ) = default;
    VcfGenotype( VcfGenotype&& )      = default;

    VcfGenotype& operator= ( VcfGenotype const& ) = default;
    VcfGenotype& operator= ( VcfGenotype&& )      = default;

    // -------------------------------------------------------------------------
    //     Access Functions
    // -------------------------------------------------------------------------

    /**
     * @brief Return the index of the variant set for this genotype call.
     */
    int32_t variant_index() const;

    /**
     * @brief True iff the called variant of this genotype is the REF allele.
     */
    bool is_reference() const;

    /**
     * @brief True iff the called variant of this genotype is not the REF,
     * but one of the ALT alleles.
     */
    bool is_alternative() const;

    /**
     * @brief True iff the variant call is missing for this genotype.
     */
    bool is_missing() const;

    /**
     * @brief True iff the called variant is phased.
     */
    bool is_phased() const;

    /**
     * @brief Return the raw genotype value as used by htslib.
     */
    int32_t data() const;

    // -------------------------------------------------------------------------
    //     Data Members
    // -------------------------------------------------------------------------

private:

    // Raw genotype value, exactly as handed to the constructor.
    int32_t genotype_;
};
387 
388 } // namespace population
389 } // namespace genesis
390 
391 #endif // htslib guard
392 #endif // include guard
genesis::population::VcfGenotype::is_reference
bool is_reference() const
True iff the called variant of this genotype is the REF allele.
Definition: vcf_common.cpp:593
genesis::population::VcfGenotype
Simple wrapper class for one genotype field for a sample.
Definition: vcf_common.hpp:308
genesis::population::VcfGenotype::is_missing
bool is_missing() const
True iff the variant call is missing for this genotype.
Definition: vcf_common.cpp:603
genesis::population::VcfGenotype::variant_index
int32_t variant_index() const
Return the index of the variant set for this genotype call.
Definition: vcf_common.cpp:588
genesis::population::VcfSpecification::description
std::string description
Definition: vcf_common.hpp:156
genesis::population::VcfGenotype::is_phased
bool is_phased() const
True iff the called variant is phased.
Definition: vcf_common.cpp:608
genesis::population::VcfValueSpecial::kAllele
@ kAllele
genesis::population::VcfValueSpecial::kGenotype
@ kGenotype
genesis::population::vcf_hl_type_to_string
std::string vcf_hl_type_to_string(int hl_type)
Internal helper function to convert htslib-internal BCF_HL_* header line type values to their string representation.
Definition: vcf_common.cpp:205
genesis::population::VcfValueType::kString
@ kString
genesis::population::VcfGenotype::data
int32_t data() const
Return the raw genotype value as used by htslib.
Definition: vcf_common.cpp:613
genesis::population::genome_region_list_from_vcf_file
GenomeRegionList genome_region_list_from_vcf_file(std::string const &file)
Read a VCF file, and use its positions to create a GenomeRegionList.
Definition: vcf_common.cpp:486
genesis::population::VcfHeaderLine::kFilter
@ kFilter
genesis::population::VcfHeaderLine
VcfHeaderLine
Specification for the values determining header line types of VCF/BCF files.
Definition: vcf_common.hpp:70
genesis::population::VcfSpecification::id
std::string id
Definition: vcf_common.hpp:152
genesis::population::VcfValueSpecial
VcfValueSpecial
Specification for special markers for the number of values expected for key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:105
genesis::population::VcfValueSpecial::kVariable
@ kVariable
Variable number of possible values, or unknown, or unbounded. In VCF, this is denoted by '.'.
genesis::population::VcfValueType::kFloat
@ kFloat
genesis::population::VcfSpecification
Collect the four required keys that describe an INFO or FORMAT sub-field of VCF/BCF files.
Definition: vcf_common.hpp:150
genesis::population::VcfHeaderLine::kGeneric
@ kGeneric
genesis::population::VcfGenotype::VcfGenotype
VcfGenotype(int32_t genotype)
Definition: vcf_common.hpp:316
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:82
genesis::population::VcfHeaderLine::kContig
@ kContig
genesis::population::VcfSpecification::special
VcfValueSpecial special
Definition: vcf_common.hpp:154
genesis::population::convert_to_variant_as_individuals
Variant convert_to_variant_as_individuals(VcfRecord const &record, bool use_allelic_depth)
Convert a VcfRecord to a Variant, treating each sample as an individual, and combining them all into one BaseCounts sample.
Definition: vcf_common.cpp:381
genesis::population::VcfValueSpecial::kReference
@ kReference
genome_region.hpp
genesis::population::VcfHeaderLine::kStructured
@ kStructured
genesis::population::VcfGenotype::~VcfGenotype
~VcfGenotype()=default
genome_region_list.hpp
genesis::population::vcf_genotype_sum
size_t vcf_genotype_sum(std::vector< VcfGenotype > const &genotypes)
Return the sum of genotypes for a set of VcfGenotype entries, typically used to construct a genotype matrix.
Definition: vcf_common.cpp:574
genesis::population::VcfValueType::kInteger
@ kInteger
genesis::population::VcfValueSpecial::kFixed
@ kFixed
Fixed number of values expected. In VCF, this is denoted simply by an integer number.
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VcfGenotype::operator=
VcfGenotype & operator=(VcfGenotype const &)=default
genesis::population::vcf_value_special_to_string
std::string vcf_value_special_to_string(VcfValueSpecial vl_type_num)
Definition: vcf_common.cpp:172
genesis::population::VcfSpecification::type
VcfValueType type
Definition: vcf_common.hpp:153
genesis::population::VcfHeaderLine::kFormat
@ kFormat
genesis::population::vcf_genotype_string
std::string vcf_genotype_string(std::vector< VcfGenotype > const &genotypes)
Return the VCF-like string representation of a set of VcfGenotype entries.
Definition: vcf_common.cpp:560
genesis::population::VcfGenotype::is_alternative
bool is_alternative() const
True iff the called variant of this genotype is not the REF, but one of the ALT alleles.
Definition: vcf_common.cpp:598
genesis::population::VcfValueType
VcfValueType
Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:88
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::population::VcfHeaderLine::kInfo
@ kInfo
genesis::population::vcf_value_type_to_string
std::string vcf_value_type_to_string(VcfValueType ht_type)
Definition: vcf_common.cpp:142
genesis::population::convert_to_variant_as_pool
Variant convert_to_variant_as_pool(VcfRecord const &record)
Convert a VcfRecord to a Variant, treating each sample column as a pool of individuals.
Definition: vcf_common.cpp:275
genesis::population::VcfSpecification::number
int number
Definition: vcf_common.hpp:155
genesis::population::VcfValueType::kFlag
@ kFlag