A library for working with phylogenetic data.
v0.25.0
vcf_common.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
31 #ifdef GENESIS_HTSLIB
32 
34 
35 extern "C" {
36  #include <htslib/hts.h>
37  #include <htslib/vcf.h>
38 }
39 
40 #include <cassert>
41 #include <cstring>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Typedefs and Enums
49 // =================================================================================================
50 
51 // VcfHeaderLine
52 static_assert(
53  static_cast<int>( VcfHeaderLine::kFilter ) == BCF_HL_FLT,
54  "Definitions of BCF_HL_FLT in htslib and of VcfHeaderLine::kFilter in genesis differ. "
55  "Please submit a bug report at https://github.com/lczech/genesis/issues"
56 );
57 static_assert(
58  static_cast<int>( VcfHeaderLine::kInfo ) == BCF_HL_INFO,
59  "Definitions of BCF_HL_INFO in htslib and of VcfHeaderLine::kInfo in genesis differ. "
60  "Please submit a bug report at https://github.com/lczech/genesis/issues"
61 );
62 static_assert(
63  static_cast<int>( VcfHeaderLine::kFormat ) == BCF_HL_FMT,
64  "Definitions of BCF_HL_FMT in htslib and of VcfHeaderLine::kFormat in genesis differ. "
65  "Please submit a bug report at https://github.com/lczech/genesis/issues"
66 );
67 static_assert(
68  static_cast<int>( VcfHeaderLine::kContig ) == BCF_HL_CTG,
69  "Definitions of BCF_HL_CTG in htslib and of VcfHeaderLine::kContig in genesis differ. "
70  "Please submit a bug report at https://github.com/lczech/genesis/issues"
71 );
72 static_assert(
73  static_cast<int>( VcfHeaderLine::kStructured ) == BCF_HL_STR,
74  "Definitions of BCF_HL_STR in htslib and of VcfHeaderLine::kStructured in genesis differ. "
75  "Please submit a bug report at https://github.com/lczech/genesis/issues"
76 );
77 static_assert(
78  static_cast<int>( VcfHeaderLine::kGeneric ) == BCF_HL_GEN,
79  "Definitions of BCF_HL_GEN in htslib and of VcfHeaderLine::kGeneric in genesis differ. "
80  "Please submit a bug report at https://github.com/lczech/genesis/issues"
81 );
82 
83 // VcfValueType
84 static_assert(
85  static_cast<int>( VcfValueType::kFlag ) == BCF_HT_FLAG,
86  "Definitions of BCF_HT_FLAG in htslib and of VcfValueType::kFlag in genesis differ. "
87  "Please submit a bug report at https://github.com/lczech/genesis/issues"
88 );
89 static_assert(
90  static_cast<int>( VcfValueType::kInteger ) == BCF_HT_INT,
91  "Definitions of BCF_HT_INT in htslib and of VcfValueType::kInteger in genesis differ. "
92  "Please submit a bug report at https://github.com/lczech/genesis/issues"
93 );
94 static_assert(
95  static_cast<int>( VcfValueType::kFloat ) == BCF_HT_REAL,
96  "Definitions of BCF_HT_REAL in htslib and of VcfValueType::kFloat in genesis differ. "
97  "Please submit a bug report at https://github.com/lczech/genesis/issues"
98 );
99 static_assert(
100  static_cast<int>( VcfValueType::kString ) == BCF_HT_STR,
101  "Definitions of BCF_HT_STR in htslib and of VcfValueType::kString in genesis differ. "
102  "Please submit a bug report at https://github.com/lczech/genesis/issues"
103 );
104 
105 // VcfValueSpecial
106 static_assert(
107  static_cast<int>( VcfValueSpecial::kFixed ) == BCF_VL_FIXED,
108  "Definitions of BCF_VL_FIXED in htslib and of VcfValueSpecial::kFixed in genesis differ. "
109  "Please submit a bug report at https://github.com/lczech/genesis/issues"
110 );
111 static_assert(
112  static_cast<int>( VcfValueSpecial::kVariable ) == BCF_VL_VAR,
113  "Definitions of BCF_VL_VAR in htslib and of VcfValueSpecial::kVariable in genesis differ. "
114  "Please submit a bug report at https://github.com/lczech/genesis/issues"
115 );
116 static_assert(
117  static_cast<int>( VcfValueSpecial::kAllele ) == BCF_VL_A,
118  "Definitions of BCF_VL_A in htslib and of VcfValueSpecial::kAllele in genesis differ. "
119  "Please submit a bug report at https://github.com/lczech/genesis/issues"
120 );
121 static_assert(
122  static_cast<int>( VcfValueSpecial::kGenotype ) == BCF_VL_G,
123  "Definitions of BCF_VL_G in htslib and of VcfValueSpecial::kGenotype in genesis differ. "
124  "Please submit a bug report at https://github.com/lczech/genesis/issues"
125 );
126 static_assert(
127  static_cast<int>( VcfValueSpecial::kReference ) == BCF_VL_R,
128  "Definitions of BCF_VL_R in htslib and of VcfValueSpecial::kReference in genesis differ. "
129  "Please submit a bug report at https://github.com/lczech/genesis/issues"
130 );
131 
132 // =================================================================================================
133 // Typedef and Enum Helpers
134 // =================================================================================================
135 
137 {
138  return vcf_value_type_to_string( static_cast<int>( ht_type ));
139 }
140 
141 std::string vcf_value_type_to_string( int ht_type )
142 {
143  switch( ht_type ) {
144  case BCF_HT_INT: {
145  return "Integer";
146  }
147  case BCF_HT_REAL: {
148  return "Float";
149  }
150  case BCF_HT_STR: {
151  return "String";
152  }
153  case BCF_HT_FLAG: {
154  return "Flag";
155  }
156  default: {
157  throw std::domain_error( "Invalid value type provided: " + std::to_string( ht_type ));
158  }
159  }
160 
161  // Cannot happen, but let's satisfy eager compilers.
162  assert( false );
163  return "Unknown";
164 }
165 
167 {
168  return vcf_value_special_to_string( static_cast<int>( vl_type_num ));
169 }
170 
171 std::string vcf_value_special_to_string( int vl_type_num )
172 {
173  switch( vl_type_num ) {
174  case BCF_VL_FIXED: {
175  return "fixed (n)";
176  }
177  case BCF_VL_VAR: {
178  return "variable (.)";
179  }
180  case BCF_VL_A: {
181  return "allele (A)";
182  }
183  case BCF_VL_G: {
184  return "genotype (G)";
185  }
186  case BCF_VL_R: {
187  return "reference (R)";
188  }
189  default: {
190  throw std::domain_error( "Invalid value number provided: " + std::to_string( vl_type_num ));
191  }
192  }
193 
194  // Cannot happen, but let's satisfy eager compilers.
195  assert( false );
196  return "Unknown";
197 }
198 
199 std::string vcf_hl_type_to_string( int hl_type )
200 {
201  switch( hl_type ) {
202  case BCF_HL_FLT: return "FILTER";
203  case BCF_HL_INFO: return "INFO";
204  case BCF_HL_FMT: return "FORMAT";
205  case BCF_HL_CTG: return "CONTIG";
206  case BCF_HL_STR: return "Structured header line";
207  case BCF_HL_GEN: return "Generic header line";
208  }
209  throw std::invalid_argument( "Invalid header line type: " + std::to_string( hl_type ));
210 }
211 
212 // =================================================================================================
213 // VCF Genotype Functions
214 // =================================================================================================
215 
216 std::string vcf_genotype_string( std::vector<VcfGenotype> const& genotypes )
217 {
218  std::string result;
219  for( size_t i = 0; i < genotypes.size(); ++i ) {
220  auto const& g = genotypes[i];
221 
222  if( i > 0 ) {
223  result += ( g.is_phased() ? "|" : "/" );
224  }
225  result += ( g.is_missing() ? "." : std::to_string( g.variant_index() ));
226  }
227  return result;
228 }
229 
230 size_t vcf_genotype_sum( std::vector<VcfGenotype> const& genotypes )
231 {
232  size_t result = 0;
233  for( auto const& gt : genotypes ) {
234  result += static_cast<size_t>( gt.is_alternative() );
235  }
236  assert( result <= genotypes.size() );
237  return result;
238 }
239 
240 // =================================================================================================
241 // VCF Genotype
242 // =================================================================================================
243 
245 {
246  return bcf_gt_allele( genotype_ );
247 }
248 
250 {
251  return bcf_gt_allele( genotype_ ) == 0;
252 }
253 
255 {
256  return bcf_gt_allele( genotype_ ) > 0;
257 }
258 
260 {
261  return bcf_gt_is_missing( genotype_ );
262 }
263 
265 {
266  return bcf_gt_is_phased( genotype_ );
267 }
268 
269 int32_t VcfGenotype::data() const
270 {
271  return genotype_;
272 }
273 
274 } // namespace population
275 } // namespace genesis
276 
277 #endif // htslib guard
genesis::population::VcfGenotype::is_reference
bool is_reference() const
True iff the called variant of this genotype is the REF allele.
Definition: vcf_common.cpp:249
genesis::population::VcfGenotype::is_missing
bool is_missing() const
True iff the variant call is missing for this genotype.
Definition: vcf_common.cpp:259
genesis::population::VcfGenotype::variant_index
int32_t variant_index() const
Return the index of the variant set for this genotype call.
Definition: vcf_common.cpp:244
genesis::population::VcfGenotype::is_phased
bool is_phased() const
True iff the called variant is phased.
Definition: vcf_common.cpp:264
genesis::population::VcfValueSpecial::kAllele
@ kAllele
genesis::population::VcfValueSpecial::kGenotype
@ kGenotype
genesis::population::vcf_hl_type_to_string
std::string vcf_hl_type_to_string(int hl_type)
Internal helper function to convert htslib-internal BCF_HL_* header line type values to their string ...
Definition: vcf_common.cpp:199
genesis::population::VcfValueType::kString
@ kString
genesis::population::VcfGenotype::data
int32_t data() const
Return the raw genotype value as used by htslib.
Definition: vcf_common.cpp:269
genesis::population::VcfHeaderLine::kFilter
@ kFilter
genesis::population::VcfValueSpecial
VcfValueSpecial
Specification for special markers for the number of values expected for key-value-pairs of VCF/BCF fi...
Definition: vcf_common.hpp:95
genesis::population::VcfValueSpecial::kVariable
@ kVariable
Variable number of possible values, or unknown, or unbounded. In VCF, this is denoted by '....
genesis::population::VcfValueType::kFloat
@ kFloat
genesis::population::VcfHeaderLine::kGeneric
@ kGeneric
genesis::population::VcfHeaderLine::kContig
@ kContig
genesis::population::VcfValueSpecial::kReference
@ kReference
vcf_common.hpp
genesis::population::VcfHeaderLine::kStructured
@ kStructured
genesis::population::vcf_genotype_sum
size_t vcf_genotype_sum(std::vector< VcfGenotype > const &genotypes)
Return the sum of genotypes for a set of VcfGenotype entries, typically used to construct a genotype ...
Definition: vcf_common.cpp:230
genesis::population::VcfValueType::kInteger
@ kInteger
genesis::population::VcfValueSpecial::kFixed
@ kFixed
Fixed number of values expected. In VCF, this is denoted simply by an integer number.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::vcf_value_special_to_string
std::string vcf_value_special_to_string(VcfValueSpecial vl_type_num)
Definition: vcf_common.cpp:166
genesis::population::VcfHeaderLine::kFormat
@ kFormat
genesis::population::vcf_genotype_string
std::string vcf_genotype_string(std::vector< VcfGenotype > const &genotypes)
Return the VCF-like string representation of a set of VcfGenotype entries.
Definition: vcf_common.cpp:216
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::population::VcfGenotype::is_alternative
bool is_alternative() const
True iff the called variant of this genotype is not the REF, but one of the ALT alleles.
Definition: vcf_common.cpp:254
genesis::population::VcfValueType
VcfValueType
Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:78
genesis::population::VcfHeaderLine::kInfo
@ kInfo
genesis::population::vcf_value_type_to_string
std::string vcf_value_type_to_string(VcfValueType ht_type)
Definition: vcf_common.cpp:136
genesis::population::VcfValueType::kFlag
@ kFlag