A library for working with phylogenetic and population genetic data.
v0.32.0
vcf_common.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
31 #ifdef GENESIS_HTSLIB
32 
34 
43 
44 extern "C" {
45  #include <htslib/hts.h>
46  #include <htslib/vcf.h>
47 }
48 
49 #include <cassert>
50 #include <cstring>
51 #include <stdexcept>
52 
53 namespace genesis {
54 namespace population {
55 
56 // =================================================================================================
57 // Typedefs and Enums
58 // =================================================================================================
59 
60 // VcfHeaderLine
61 static_assert(
62  static_cast<int>( VcfHeaderLine::kFilter ) == BCF_HL_FLT,
63  "Definitions of BCF_HL_FLT in htslib and of VcfHeaderLine::kFilter in genesis differ. "
64  "Please submit a bug report at https://github.com/lczech/genesis/issues"
65 );
66 static_assert(
67  static_cast<int>( VcfHeaderLine::kInfo ) == BCF_HL_INFO,
68  "Definitions of BCF_HL_INFO in htslib and of VcfHeaderLine::kInfo in genesis differ. "
69  "Please submit a bug report at https://github.com/lczech/genesis/issues"
70 );
71 static_assert(
72  static_cast<int>( VcfHeaderLine::kFormat ) == BCF_HL_FMT,
73  "Definitions of BCF_HL_FMT in htslib and of VcfHeaderLine::kFormat in genesis differ. "
74  "Please submit a bug report at https://github.com/lczech/genesis/issues"
75 );
76 static_assert(
77  static_cast<int>( VcfHeaderLine::kContig ) == BCF_HL_CTG,
78  "Definitions of BCF_HL_CTG in htslib and of VcfHeaderLine::kContig in genesis differ. "
79  "Please submit a bug report at https://github.com/lczech/genesis/issues"
80 );
81 static_assert(
82  static_cast<int>( VcfHeaderLine::kStructured ) == BCF_HL_STR,
83  "Definitions of BCF_HL_STR in htslib and of VcfHeaderLine::kStructured in genesis differ. "
84  "Please submit a bug report at https://github.com/lczech/genesis/issues"
85 );
86 static_assert(
87  static_cast<int>( VcfHeaderLine::kGeneric ) == BCF_HL_GEN,
88  "Definitions of BCF_HL_GEN in htslib and of VcfHeaderLine::kGeneric in genesis differ. "
89  "Please submit a bug report at https://github.com/lczech/genesis/issues"
90 );
91 
92 // VcfValueType
93 static_assert(
94  static_cast<int>( VcfValueType::kFlag ) == BCF_HT_FLAG,
95  "Definitions of BCF_HT_FLAG in htslib and of VcfValueType::kFlag in genesis differ. "
96  "Please submit a bug report at https://github.com/lczech/genesis/issues"
97 );
98 static_assert(
99  static_cast<int>( VcfValueType::kInteger ) == BCF_HT_INT,
100  "Definitions of BCF_HT_INT in htslib and of VcfValueType::kInteger in genesis differ. "
101  "Please submit a bug report at https://github.com/lczech/genesis/issues"
102 );
103 static_assert(
104  static_cast<int>( VcfValueType::kFloat ) == BCF_HT_REAL,
105  "Definitions of BCF_HT_REAL in htslib and of VcfValueType::kFloat in genesis differ. "
106  "Please submit a bug report at https://github.com/lczech/genesis/issues"
107 );
108 static_assert(
109  static_cast<int>( VcfValueType::kString ) == BCF_HT_STR,
110  "Definitions of BCF_HT_STR in htslib and of VcfValueType::kString in genesis differ. "
111  "Please submit a bug report at https://github.com/lczech/genesis/issues"
112 );
113 
114 // VcfValueSpecial
115 static_assert(
116  static_cast<int>( VcfValueSpecial::kFixed ) == BCF_VL_FIXED,
117  "Definitions of BCF_VL_FIXED in htslib and of VcfValueSpecial::kFixed in genesis differ. "
118  "Please submit a bug report at https://github.com/lczech/genesis/issues"
119 );
120 static_assert(
121  static_cast<int>( VcfValueSpecial::kVariable ) == BCF_VL_VAR,
122  "Definitions of BCF_VL_VAR in htslib and of VcfValueSpecial::kVariable in genesis differ. "
123  "Please submit a bug report at https://github.com/lczech/genesis/issues"
124 );
125 static_assert(
126  static_cast<int>( VcfValueSpecial::kAllele ) == BCF_VL_A,
127  "Definitions of BCF_VL_A in htslib and of VcfValueSpecial::kAllele in genesis differ. "
128  "Please submit a bug report at https://github.com/lczech/genesis/issues"
129 );
130 static_assert(
131  static_cast<int>( VcfValueSpecial::kGenotype ) == BCF_VL_G,
132  "Definitions of BCF_VL_G in htslib and of VcfValueSpecial::kGenotype in genesis differ. "
133  "Please submit a bug report at https://github.com/lczech/genesis/issues"
134 );
135 static_assert(
136  static_cast<int>( VcfValueSpecial::kReference ) == BCF_VL_R,
137  "Definitions of BCF_VL_R in htslib and of VcfValueSpecial::kReference in genesis differ. "
138  "Please submit a bug report at https://github.com/lczech/genesis/issues"
139 );
140 
141 // =================================================================================================
142 // Typedef and Enum Helpers
143 // =================================================================================================
144 
146 {
147  return vcf_value_type_to_string( static_cast<int>( ht_type ));
148 }
149 
150 std::string vcf_value_type_to_string( int ht_type )
151 {
152  switch( ht_type ) {
153  case BCF_HT_INT: {
154  return "Integer";
155  }
156  case BCF_HT_REAL: {
157  return "Float";
158  }
159  case BCF_HT_STR: {
160  return "String";
161  }
162  case BCF_HT_FLAG: {
163  return "Flag";
164  }
165  default: {
166  throw std::domain_error( "Invalid value type provided: " + std::to_string( ht_type ));
167  }
168  }
169 
170  // Cannot happen, but let's satisfy eager compilers.
171  assert( false );
172  return "Unknown";
173 }
174 
176 {
177  return vcf_value_special_to_string( static_cast<int>( vl_type_num ));
178 }
179 
180 std::string vcf_value_special_to_string( int vl_type_num )
181 {
182  switch( vl_type_num ) {
183  case BCF_VL_FIXED: {
184  return "fixed (n)";
185  }
186  case BCF_VL_VAR: {
187  return "variable (.)";
188  }
189  case BCF_VL_A: {
190  return "allele (A)";
191  }
192  case BCF_VL_G: {
193  return "genotype (G)";
194  }
195  case BCF_VL_R: {
196  return "reference (R)";
197  }
198  default: {
199  throw std::domain_error( "Invalid value number provided: " + std::to_string( vl_type_num ));
200  }
201  }
202 
203  // Cannot happen, but let's satisfy eager compilers.
204  assert( false );
205  return "Unknown";
206 }
207 
208 std::string vcf_hl_type_to_string( int hl_type )
209 {
210  switch( hl_type ) {
211  case BCF_HL_FLT: return "FILTER";
212  case BCF_HL_INFO: return "INFO";
213  case BCF_HL_FMT: return "FORMAT";
214  case BCF_HL_CTG: return "CONTIG";
215  case BCF_HL_STR: return "Structured header line";
216  case BCF_HL_GEN: return "Generic header line";
217  }
218  throw std::invalid_argument( "Invalid header line type: " + std::to_string( hl_type ));
219 }
220 
221 // =================================================================================================
222 // Conversion Functions
223 // =================================================================================================
224 
235 std::pair<std::array<char, 6>, size_t> get_vcf_record_snp_ref_alt_chars_( VcfRecord const& record )
236 {
237  // Get all variants (REF and ALT), and check them. We manually add deletion here if ALT == ".",
238  // as this is not part of the variants provided from htslib.
239  // There are only 6 possible single nucleotide variants (ACGTN.), so we save the effort of
240  // creating an intermediate vector, and use a fixed size array and a counter instead.
241  // Also, we use htslib functions directly on the vcf record to not have to go through
242  // string allocations for each alternative.
243  record.unpack();
244  auto rec_data = record.data();
245 
246  // To avoid too much branching and allocation, we init the array so that we have all deletions
247  // initially, and hence do not need to overwrite in the case that we added that deletion
248  // manually to the counter.
249  auto vars = std::array<char, 6>{{ '.', '.', '.', '.', '.', '.' }};
250  if( rec_data->n_allele > 6 ) {
251  throw std::runtime_error(
252  "Invalid VCF Record that contains a REF or ALT sequence/allele with "
253  "invalid nucleitides where only `[ACGTN.]` are allowed, at " +
254  record.get_chromosome() + ":" + std::to_string( record.get_position() )
255  );
256  }
257 
258  // Now store all single nucleotide alleles that are in the record
259  // (we only count up to the actual number that is there, so that the missing deletion [in case
260  // that this record has a deletion] is not touched).
261  for( size_t i = 0; i < rec_data->n_allele; ++i ) {
262  if( std::strlen( rec_data->d.allele[i] ) != 1 ) {
263  throw std::runtime_error(
264  "Cannot convert VcfRecord to Variant, as one of the VcfRecord REF or ALT "
265  "sequences/alleles is not a single nucleotide (it is not a SNP), at " +
266  record.get_chromosome() + ":" + std::to_string( record.get_position() ) +
267  ". At the time being, we are not supporting indels and other such variants."
268  );
269  }
270  vars[i] = *rec_data->d.allele[i];
271  }
272 
273  return { vars, rec_data->n_allele };
274 }
275 
280  VcfRecord const& record,
281  std::pair<std::array<char, 6>, size_t> const& snp_chars,
282  VcfFormatIteratorInt const& sample_ad,
283  SampleCounts& sample
284 ) {
285  assert( sample_ad.valid_value_count() == 0 || sample_ad.valid_value_count() == snp_chars.second );
286  for( size_t i = 0; i < sample_ad.valid_value_count(); ++i ) {
287 
288  // Get the nucleotide and its count.
289  auto const cnt = sample_ad.get_value_at(i);
290  if( cnt < 0 ) {
291  throw std::runtime_error(
292  "Invalid VCF Record with FORMAT field 'AD' value < 0 for a sample, at " +
293  record.get_chromosome() + ":" + std::to_string( record.get_position() )
294  );
295  }
296 
297  // Add it to the respective count variable of the sample.
298  switch( snp_chars.first[i] ) {
299  case 'a':
300  case 'A': {
301  sample.a_count = cnt;
302  break;
303  }
304  case 'c':
305  case 'C': {
306  sample.c_count = cnt;
307  break;
308  }
309  case 'g':
310  case 'G': {
311  sample.g_count = cnt;
312  break;
313  }
314  case 't':
315  case 'T': {
316  sample.t_count = cnt;
317  break;
318  }
319  case 'n':
320  case 'N': {
321  sample.n_count = cnt;
322  break;
323  }
324  case '*': {
325  sample.d_count = cnt;
326  break;
327  }
328  default: {
329  throw std::runtime_error(
330  "Invalid VCF Record that contains a REF or ALT sequence/allele with "
331  "invalid nucleitide `" + std::string( 1, snp_chars.first[i] ) +
332  "` where only `[ACGTN*]` are allowed, at " +
333  record.get_chromosome() + ":" + std::to_string( record.get_position() )
334  );
335  }
336  }
337  }
338 }
339 
345  VcfRecord const& record,
346  Variant& variant
347 ) {
348  // Helper to avoid having the same error string twice.
349  auto throw_invalid_gt_size_ = [&](){
350  throw std::runtime_error(
351  "Invalid VCF Record with number of samples in the record (" +
352  std::to_string( variant.samples.size() ) +
353  ") not equal to the number of GT fields, at " +
354  record.get_chromosome() + ":" + std::to_string( record.get_position() )
355  );
356  };
357 
358  // Now iterate the genotypes and see if any is missing. If so, we set the status accordingly.
359  size_t sample_idx = 0;
360  size_t missing_cnt = 0;
361  for( auto const& sample_gt : record.get_format_genotype() ) {
362  if( sample_idx >= variant.samples.size() ) {
363  throw_invalid_gt_size_();
364  }
365 
366  // If any of the genotypes is missing, we consider the whole sample to be missing.
367  bool is_missing = false;
368  for( size_t i = 0; i < sample_gt.valid_value_count(); ++i ) {
369  auto const gt = sample_gt.get_value_at(i);
370  if( gt.is_missing() ) {
371  is_missing = true;
372  break;
373  }
374  }
375  if( is_missing ) {
376  ++missing_cnt;
377  variant.samples[sample_idx].status.set( SampleCountsFilterTag::kMissing );
378  }
379  ++sample_idx;
380  }
381 
382  // We need to have exactly one set of GT values per sample.
383  if( sample_idx != variant.samples.size() ) {
384  throw_invalid_gt_size_();
385  }
386 
387  // If all samples are missing, so is the whole variant.
388  if( missing_cnt == variant.samples.size() ) {
390  }
391 }
392 
394 {
395  // Error check.
396  if( ! record.has_format( "AD" )) {
397  throw std::runtime_error(
398  "Cannot convert VcfRecord to Variant, as the VcfRecord does not have "
399  "the required FORMAT field 'AD' for alleleic depth."
400  );
401  }
402 
403  // Get the ref and alt chars of the SNP.
404  auto const snp_chars = get_vcf_record_snp_ref_alt_chars_( record );
405 
406  // Prepare common fields of the result.
407  // For the reference base, we use the first nucleotide of the first variant (REF);
408  // above, we have ensured that this exists and is in fact a single nucleotide only.
409  // Same for the alternative base, where we use the first ALT in the record.
410  Variant result;
411  result.chromosome = record.get_chromosome();
412  result.position = record.get_position();
413  result.reference_base = utils::to_upper( snp_chars.first[0] );
414  result.alternative_base = utils::to_upper( snp_chars.first[1] );
415  // TODO the alt base above is only reasonable for biallelic SNPs
416 
417  // Process the sample AD counts that are present in the VCF record line.
418  result.samples.reserve( record.header().get_sample_count() );
419  for( auto const& sample_ad : record.get_format_int("AD") ) {
420  if( sample_ad.valid_value_count() > 0 && sample_ad.valid_value_count() != snp_chars.second ) {
421  throw std::runtime_error(
422  "Invalid VCF Record that contains " + std::to_string( snp_chars.second ) +
423  " REF and ALT sequences/alleles, but its FORMAT field 'AD' only contains " +
424  std::to_string( sample_ad.valid_value_count() ) + " entries, at " +
425  record.get_chromosome() + ":" + std::to_string( record.get_position() )
426  );
427  }
428 
429  // Go through all REF and ALT entries and their respective FORMAT 'AD' counts,
430  // and sum them up, storing them in a new SampleCounts instance at the end of the vector.
431  result.samples.emplace_back();
432  auto& sample = result.samples.back();
433  convert_to_variant_as_pool_tally_bases_( record, snp_chars, sample_ad, sample );
434  }
435 
436  // Proof check the number of samples that were found.
437  if( result.samples.size() != record.header().get_sample_count() ) {
438  throw std::runtime_error(
439  "Invalid VCF Record with number of samples in the record (" +
440  std::to_string( result.samples.size() ) + ") not equal to the number of samples given "
441  "in the VCF header (" + std::to_string( record.header().get_sample_count() ) + "), at " +
442  record.get_chromosome() + ":" + std::to_string( record.get_position() )
443  );
444  }
445 
446  // Set any samples as missing if there is no genotype. We still keep the above AD settings,
447  // so that missing data is still filled in as far as possible.
449 
450  return result;
451 }
452 
454  VcfRecord const& record,
455  bool use_allelic_depth
456 ) {
457  Variant result;
458 
459  // Short solution for when we want to use the AD field:
460  // Simply re-use the pool approach, and merge into one SampleCounts.
461  if( use_allelic_depth ) {
462  result = convert_to_variant_as_pool( record );
463  result.samples = std::vector<SampleCounts>{ merge(
465  )};
466  return result;
467  }
468 
469  // Here we treat each individual just by counting genotypes.
470  record.unpack();
471 
472  // Error check.
473  if( ! record.has_format( "GT" )) {
474  throw std::runtime_error(
475  "Cannot convert VcfRecord to Variant, as the VcfRecord does not have "
476  "the required FORMAT field 'GT' for genotypes."
477  );
478  }
479 
480  // Get the ref and alt chars of the SNP.
481  auto const snp_chars = get_vcf_record_snp_ref_alt_chars_( record );
482 
483  // Prepare common fields of the result. Same as convert_to_variant_as_pool(), see there.
484  result.chromosome = record.get_chromosome();
485  result.position = record.get_position();
486  result.reference_base = utils::to_upper( snp_chars.first[0] );
487  result.alternative_base = utils::to_upper( snp_chars.first[1] );
488  // TODO the alt base above is only reasonable for biallelic SNPs
489 
490  // We merge everything into one sample, representing the individuals as a pool.
491  result.samples.resize( 1 );
492  auto& sample = result.samples.back();
493 
494  // Keep track of how many genotypes we processed, and how many of them are missing.
495  size_t total_cnt = 0;
496  size_t missing_cnt = 0;
497 
498  // Go through all sample columns of the VCF, examining their GT field.
499  for( auto const& sample_gt : record.get_format_genotype() ) {
500 
501  // Go through all REF and ALT entries and their respective GT values for the current sample.
502  for( size_t i = 0; i < sample_gt.valid_value_count(); ++i ) {
503  ++total_cnt;
504 
505  // Get the genoptype and immediately convert to the index
506  // that we can look up in the snp array.
507  auto const gt = sample_gt.get_value_at(i);
508  auto const gt_int = gt.variant_index();
509 
510  // If the VCF is not totally messed up, this needs to be within the number of
511  // REF and ALT nucleotides (or -1 for deletions); check that.
512  if( !( gt_int < 0 || static_cast<size_t>(gt_int) < snp_chars.second )) {
513  throw std::runtime_error(
514  "Invalid VCF Record that contains an index " + std::to_string( gt_int ) +
515  " into the genotype list that does not exist, at " +
516  record.get_chromosome() + ":" + std::to_string( record.get_position() )
517  );
518  }
519 
520  // Special case for missing variant calls: nothing to add.
521  if( gt.is_missing() ) {
522  ++missing_cnt;
523  continue;
524  }
525 
526  // Special case deletion. The genotype value stored in VCF is -1 in that case.
527  if( gt_int < 0 ) {
528  ++sample.d_count;
529  continue;
530  }
531 
532  // Use the index to get what nucleotide the genotype is, and increment the count.
533  switch( snp_chars.first[gt_int] ) {
534  case 'a':
535  case 'A': {
536  ++sample.a_count;
537  break;
538  }
539  case 'c':
540  case 'C': {
541  ++sample.c_count;
542  break;
543  }
544  case 'g':
545  case 'G': {
546  ++sample.g_count;
547  break;
548  }
549  case 't':
550  case 'T': {
551  ++sample.t_count;
552  break;
553  }
554  case 'n':
555  case 'N': {
556  ++sample.n_count;
557  break;
558  }
559  default: {
560  throw std::runtime_error(
561  "Invalid VCF Record that contains a REF or ALT sequence/allele with "
562  "invalid nucleitide `" + std::string( 1, snp_chars.first[i] ) +
563  "` where only `[ACGTN.]` are allowed, at " +
564  record.get_chromosome() + ":" + std::to_string( record.get_position() )
565  );
566  }
567  }
568  }
569  }
570 
571  // If everything was missing, we set the filters accordingly.
572  if( missing_cnt == total_cnt ) {
573  sample.status.set( SampleCountsFilterTag::kMissing );
575  }
576 
577  return result;
578 }
579 
581 {
582  // Simpler version than genome_region_list_from_vcf_file(), as we do not need to keep track
583  // of regions here... simply add all the positions individually to the set.
584  GenomeLocusSet result;
585 
586  // Open and read file, without expecting it to be sorted.
587  // We could try some shenannigans here to keep track of the chromosome, in order to avoid
588  // having to find it each time in the genome locus set. However, the genome locus set add
589  // function also takes care of resizing the bitvector if needed, and adds some safety checks,
590  // so it's good to rely on that, even though it is slightly slower. The VCF parsing is
591  // likely the bottleneck anyway.
592  auto it = VcfInputStream( file, false );
593  while( it ) {
594  result.add( it.record().get_chromosome(), it.record().get_position() );
595  ++it;
596  }
597  return result;
598 }
599 
601 {
602  GenomeRegionList result;
603  genome_region_list_from_vcf_file( file, result );
604  return result;
605 }
606 
607 void genome_region_list_from_vcf_file( std::string const& file, GenomeRegionList& target )
608 {
609  // Prepare bookkeeping. We need the chromosome, the position where we started the current
610  // interval, and the position where we are at in the current interval.
611  std::string cur_chr;
612  size_t beg_pos = 0;
613  size_t cur_pos = 0;
614 
615  auto insert_ = [&]( std::string const& chr, size_t beg, size_t end ){
616  if( chr.empty() ) {
617  return;
618  }
619  assert( beg > 0 && end > 0 );
620  assert( beg <= end );
621 
622  // We add the interval, using the merge flag,
623  // to make sure that even unsorted VCFs produce consecutive, fully merged regions.
624  target.add( chr, beg, end, true );
625  };
626 
627  // Open and read file, without expecting it to be sorted.
628  auto it = VcfInputStream( file, false );
629  while( it ) {
630  if( it.record().get_chromosome() == cur_chr ) {
631  // We are still within the same chromosome.
632 
633  // If we did not move (can happen if multiple variants are reported at the same position),
634  // or moved exactly one position, we are still in the same interval.
635  // We could simply use the insert_overlap functionality of the IntervalTree here
636  // (via GenomeRegionList.add()), and that would already take care of the merging.
637  // But this here is likely faster, as we do not need to always remove and add
638  // intervals to the tree for consecutive runs of positions, which is most likely
639  // the most common use case (using a sorted VCF file).
640  auto const pos = it.record().get_position();
641  if( pos == cur_pos || pos == cur_pos + 1 ) {
642  cur_pos = pos;
643  } else {
644  // Otherwise, we are at a new interval, so we need to finish the current one.
645  assert( ! cur_chr.empty() );
646  insert_( cur_chr, beg_pos, cur_pos );
647 
648  // Now set the start of the next interval.
649  beg_pos = pos;
650  cur_pos = pos;
651  }
652  } else {
653  // We are at a new chromosome.
654 
655  // Unless we just started, we add the interval, again using the merge flag.
656  insert_( cur_chr, beg_pos, cur_pos );
657 
658  // Now set the start of the new interval.
659  cur_chr = it.record().get_chromosome();
660  beg_pos = it.record().get_position();
661  cur_pos = beg_pos;
662  }
663  ++it;
664  }
665 
666  // Finally, add the last interval that remains after the file is done.
667  insert_( cur_chr, beg_pos, cur_pos );
668 }
669 
670 // =================================================================================================
671 // VCF Genotype Functions
672 // =================================================================================================
673 
674 std::string vcf_genotype_string( std::vector<VcfGenotype> const& genotypes )
675 {
676  std::string result;
677  for( size_t i = 0; i < genotypes.size(); ++i ) {
678  auto const& g = genotypes[i];
679 
680  if( i > 0 ) {
681  result += ( g.is_phased() ? "|" : "/" );
682  }
683  result += ( g.is_missing() ? "." : std::to_string( g.variant_index() ));
684  }
685  return result;
686 }
687 
688 size_t vcf_genotype_sum( std::vector<VcfGenotype> const& genotypes )
689 {
690  size_t result = 0;
691  for( auto const& gt : genotypes ) {
692  result += static_cast<size_t>( gt.is_alternative() );
693  }
694  assert( result <= genotypes.size() );
695  return result;
696 }
697 
698 // =================================================================================================
699 // VCF Genotype
700 // =================================================================================================
701 
703 {
704  return bcf_gt_allele( genotype_ );
705 }
706 
708 {
709  return bcf_gt_allele( genotype_ ) == 0;
710 }
711 
713 {
714  return bcf_gt_allele( genotype_ ) > 0;
715 }
716 
718 {
719  return bcf_gt_is_missing( genotype_ );
720 }
721 
723 {
724  return bcf_gt_is_phased( genotype_ );
725 }
726 
727 int32_t VcfGenotype::data() const
728 {
729  return genotype_;
730 }
731 
732 } // namespace population
733 } // namespace genesis
734 
735 #endif // htslib guard
genesis::population::VcfGenotype::is_reference
bool is_reference() const
True iff the called variant of this genotype is the REF allele.
Definition: vcf_common.cpp:707
genesis::population::VcfGenotype::is_missing
bool is_missing() const
True iff the variant call is missing for this genotype.
Definition: vcf_common.cpp:717
genesis::population::VcfGenotype::variant_index
int32_t variant_index() const
Return the index of the variant set for this genotype call.
Definition: vcf_common.cpp:702
genesis::population::VcfRecord::header
VcfHeader & header()
Return the VcfHeader instance associated with this record.
Definition: vcf_record.hpp:213
genesis::population::VcfGenotype::is_phased
bool is_phased() const
True iff the called variant is phased.
Definition: vcf_common.cpp:722
genesis::population::merge
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
Definition: population/function/functions.cpp:400
functions.hpp
genesis::population::VcfValueSpecial::kAllele
@ kAllele
genesis::population::GenomeRegionList::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_region_list.hpp:139
genesis::population::VcfValueSpecial::kGenotype
@ kGenotype
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
genesis::population::Variant::position
size_t position
Definition: variant.hpp:74
genesis::population::vcf_hl_type_to_string
std::string vcf_hl_type_to_string(int hl_type)
Internal helper function to convert htslib-internal BCF_HL_* header line type values to their string ...
Definition: vcf_common.cpp:208
genesis::population::VcfValueType::kString
@ kString
genesis::population::VcfGenotype::data
int32_t data() const
Return the raw genotype value as used by htslib.
Definition: vcf_common.cpp:727
genesis::population::genome_region_list_from_vcf_file
GenomeRegionList genome_region_list_from_vcf_file(std::string const &file)
Read a VCF file, and use its positions to create a GenomeRegionList.
Definition: vcf_common.cpp:600
genesis::population::GenomeLocusSet
List of positions/coordinates in a genome, for each chromosome.
Definition: genome_locus_set.hpp:75
genesis::population::VcfHeaderLine::kFilter
@ kFilter
genesis::population::VcfValueSpecial
VcfValueSpecial
Specification for special markers for the number of values expected for key-value-pairs of VCF/BCF fi...
Definition: vcf_common.hpp:106
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:79
genesis::population::VcfValueSpecial::kVariable
@ kVariable
Variable number of possible values, or unknown, or unbounded. In VCF, this is denoted by '....
genesis::population::VcfValueType::kFloat
@ kFloat
genesis::population::VcfHeaderLine::kGeneric
@ kGeneric
genesis::population::GenomeRegionList
List of regions in a genome, for each chromosome.
Definition: genome_region_list.hpp:95
genesis::population::VcfHeaderLine::kContig
@ kContig
genesis::population::convert_to_variant_as_individuals
Variant convert_to_variant_as_individuals(VcfRecord const &record, bool use_allelic_depth)
Convert a VcfRecord to a Variant, treating each sample as an individual, and combining them all into ...
Definition: vcf_common.cpp:453
genesis::population::convert_to_variant_as_pool_set_missing_gt_
void convert_to_variant_as_pool_set_missing_gt_(VcfRecord const &record, Variant &variant)
Local helper function that sets the filter status of a Variant and its samples to missing depending o...
Definition: vcf_common.cpp:344
genesis::population::genome_locus_set_from_vcf_file
GenomeLocusSet genome_locus_set_from_vcf_file(std::string const &file)
Read a VCF file, and use its positions to create a GenomeLocusSet.
Definition: vcf_common.cpp:580
genesis::population::FilterStatus::set
void set(IntType value)
Definition: filter_status.hpp:100
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
sample_counts_filter.hpp
genesis::population::VcfValueSpecial::kReference
@ kReference
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
sample_counts.hpp
genesis::population::VcfRecord::data
::bcf1_t * data()
Return the internal htslib bcf1_t record data struct pointer.
Definition: vcf_record.hpp:194
genesis::population::get_vcf_record_snp_ref_alt_chars_
std::pair< std::array< char, 6 >, size_t > get_vcf_record_snp_ref_alt_chars_(VcfRecord const &record)
Local helper function that returns the REF and ALT chars of a VcfRecord for SNPs.
Definition: vcf_common.cpp:235
genesis::population::VcfFormatIterator
Iterate the FORMAT information for the samples in a SNP/variant line in a VCF/BCF file.
Definition: vcf_format_iterator.hpp:62
vcf_common.hpp
genesis::population::VcfHeaderLine::kStructured
@ kStructured
genesis::population::SampleCounts::n_count
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: sample_counts.hpp:86
genesis::population::VcfHeader::get_sample_count
size_t get_sample_count() const
Get the number of samples (columns) in the file.
Definition: vcf_header.cpp:344
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
genesis::population::vcf_genotype_sum
size_t vcf_genotype_sum(std::vector< VcfGenotype > const &genotypes)
Return the sum of genotypes for a set of VcfGenotype entries, typically used to construct a genotype ...
Definition: vcf_common.cpp:688
genesis::population::VcfValueType::kInteger
@ kInteger
genesis::population::VcfValueSpecial::kFixed
@ kFixed
Fixed number of values expected. In VCF, this is denoted simply by an integer number.
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis::population::VcfRecord::get_chromosome
std::string get_chromosome() const
Get the name of a chromosome/contig/sequence (CHROM, first column of the line).
Definition: vcf_record.cpp:170
genesis::population::convert_to_variant_as_pool_tally_bases_
void convert_to_variant_as_pool_tally_bases_(VcfRecord const &record, std::pair< std::array< char, 6 >, size_t > const &snp_chars, VcfFormatIteratorInt const &sample_ad, SampleCounts &sample)
Local helper function to tally up the bases form a VcfRecord into a SampleCounts.
Definition: vcf_common.cpp:279
genesis::population::VcfRecord::get_format_genotype
genesis::utils::Range< VcfFormatIteratorGenotype > get_format_genotype() const
Get an iterator pair over the samples that accesses the FORMAT genotype (GT field/key/id) as a set of...
Definition: vcf_record.cpp:562
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::vcf_value_special_to_string
std::string vcf_value_special_to_string(VcfValueSpecial vl_type_num)
Definition: vcf_common.cpp:175
genesis::population::VcfRecord::has_format
bool has_format(std::string const &id) const
Return whether the record has a given FORMAT id present.
Definition: vcf_record.cpp:522
genesis::population::SampleCountsFilterPolicy::kOnlyPassing
@ kOnlyPassing
genesis::population::VcfHeaderLine::kFormat
@ kFormat
variant_filter.hpp
char.hpp
genesis::population::VcfRecord::get_position
size_t get_position() const
Get the position within the chromosome/contig (POS, second column of the line).
Definition: vcf_record.cpp:181
genesis::population::VariantFilterTag::kMissing
@ kMissing
Position is missing in the input data.
genesis::population::vcf_genotype_string
std::string vcf_genotype_string(std::vector< VcfGenotype > const &genotypes)
Return the VCF-like string representation of a set of VcfGenotype entries.
Definition: vcf_common.cpp:674
genesis::population::VcfGenotype::is_alternative
bool is_alternative() const
True iff the called variant of this genotype is not the REF, but one of the ALT alleles.
Definition: vcf_common.cpp:712
genesis::population::VcfValueType
VcfValueType
Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
Definition: vcf_common.hpp:89
genesis::population::VcfFormatIterator::valid_value_count
size_t valid_value_count() const
Return the number of valid values for the current sample.
Definition: vcf_format_iterator.hpp:591
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
genesis::population::VcfInputStream
Iterate an input source and parse it as a VCF/BCF file.
Definition: vcf_input_stream.hpp:76
genesis::population::VcfRecord
Capture the information of a single SNP/variant line in a VCF/BCF file.
Definition: vcf_record.hpp:107
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:80
variant.hpp
genesis::population::VcfHeaderLine::kInfo
@ kInfo
genesis::population::SampleCounts::d_count
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: sample_counts.hpp:91
genesis::population::GenomeLocusSet::add
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
Definition: genome_locus_set.hpp:127
genesis::population::VcfRecord::unpack
void unpack() const
Unpack the htslib bcf1_t record data.
Definition: vcf_record.cpp:165
vcf_record.hpp
genesis::population::VcfRecord::get_format_int
genesis::utils::Range< VcfFormatIteratorInt > get_format_int(std::string const &id) const
Get an iterator pair over the samples that accesses a certain FORMAT id as an int value.
Definition: vcf_record.cpp:598
genesis::population::VcfFormatIterator::get_value_at
T get_value_at(size_t value_index) const
Get the value at a given value_index of the current sample.
Definition: vcf_format_iterator.hpp:647
vcf_input_stream.hpp
genesis::population::SampleCountsFilterTag::kMissing
@ kMissing
Position is missing in the input data.
genesis::population::vcf_value_type_to_string
std::string vcf_value_type_to_string(VcfValueType ht_type)
Definition: vcf_common.cpp:145
genesis::population::convert_to_variant_as_pool
Variant convert_to_variant_as_pool(VcfRecord const &record)
Convert a VcfRecord to a Variant, treating each sample column as a pool of individuals.
Definition: vcf_common.cpp:393
genesis::population::VcfValueType::kFlag
@ kFlag
genesis::population::Variant::status
FilterStatus status
Definition: variant.hpp:76
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:73
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76