|
A library for working with phylogenetic and population genetic data.
v0.32.0
|
|
Go to the documentation of this file.
45 #include <htslib/hts.h>
46 #include <htslib/vcf.h>
54 namespace population {
63 "Definitions of BCF_HL_FLT in htslib and of VcfHeaderLine::kFilter in genesis differ. "
64 "Please submit a bug report at https://github.com/lczech/genesis/issues"
68 "Definitions of BCF_HL_INFO in htslib and of VcfHeaderLine::kInfo in genesis differ. "
69 "Please submit a bug report at https://github.com/lczech/genesis/issues"
73 "Definitions of BCF_HL_FMT in htslib and of VcfHeaderLine::kFormat in genesis differ. "
74 "Please submit a bug report at https://github.com/lczech/genesis/issues"
78 "Definitions of BCF_HL_CTG in htslib and of VcfHeaderLine::kContig in genesis differ. "
79 "Please submit a bug report at https://github.com/lczech/genesis/issues"
83 "Definitions of BCF_HL_STR in htslib and of VcfHeaderLine::kStructured in genesis differ. "
84 "Please submit a bug report at https://github.com/lczech/genesis/issues"
88 "Definitions of BCF_HL_GEN in htslib and of VcfHeaderLine::kGeneric in genesis differ. "
89 "Please submit a bug report at https://github.com/lczech/genesis/issues"
95 "Definitions of BCF_HT_FLAG in htslib and of VcfValueType::kFlag in genesis differ. "
96 "Please submit a bug report at https://github.com/lczech/genesis/issues"
100 "Definitions of BCF_HT_INT in htslib and of VcfValueType::kInteger in genesis differ. "
101 "Please submit a bug report at https://github.com/lczech/genesis/issues"
105 "Definitions of BCF_HT_REAL in htslib and of VcfValueType::kFloat in genesis differ. "
106 "Please submit a bug report at https://github.com/lczech/genesis/issues"
110 "Definitions of BCF_HT_STR in htslib and of VcfValueType::kString in genesis differ. "
111 "Please submit a bug report at https://github.com/lczech/genesis/issues"
117 "Definitions of BCF_VL_FIXED in htslib and of VcfValueSpecial::kFixed in genesis differ. "
118 "Please submit a bug report at https://github.com/lczech/genesis/issues"
122 "Definitions of BCF_VL_VAR in htslib and of VcfValueSpecial::kVariable in genesis differ. "
123 "Please submit a bug report at https://github.com/lczech/genesis/issues"
127 "Definitions of BCF_VL_A in htslib and of VcfValueSpecial::kAllele in genesis differ. "
128 "Please submit a bug report at https://github.com/lczech/genesis/issues"
132 "Definitions of BCF_VL_G in htslib and of VcfValueSpecial::kGenotype in genesis differ. "
133 "Please submit a bug report at https://github.com/lczech/genesis/issues"
137 "Definitions of BCF_VL_R in htslib and of VcfValueSpecial::kReference in genesis differ. "
138 "Please submit a bug report at https://github.com/lczech/genesis/issues"
166 throw std::domain_error(
"Invalid value type provided: " +
std::to_string( ht_type ));
182 switch( vl_type_num ) {
187 return "variable (.)";
193 return "genotype (G)";
196 return "reference (R)";
199 throw std::domain_error(
"Invalid value number provided: " +
std::to_string( vl_type_num ));
211 case BCF_HL_FLT:
return "FILTER";
212 case BCF_HL_INFO:
return "INFO";
213 case BCF_HL_FMT:
return "FORMAT";
214 case BCF_HL_CTG:
return "CONTIG";
215 case BCF_HL_STR:
return "Structured header line";
216 case BCF_HL_GEN:
return "Generic header line";
218 throw std::invalid_argument(
"Invalid header line type: " +
std::to_string( hl_type ));
244 auto rec_data = record.
data();
249 auto vars = std::array<char, 6>{{
'.',
'.',
'.',
'.',
'.',
'.' }};
250 if( rec_data->n_allele > 6 ) {
251 throw std::runtime_error(
252 "Invalid VCF Record that contains a REF or ALT sequence/allele with "
253 "invalid nucleitides where only `[ACGTN.]` are allowed, at " +
261 for(
size_t i = 0; i < rec_data->n_allele; ++i ) {
262 if( std::strlen( rec_data->d.allele[i] ) != 1 ) {
263 throw std::runtime_error(
264 "Cannot convert VcfRecord to Variant, as one of the VcfRecord REF or ALT "
265 "sequences/alleles is not a single nucleotide (it is not a SNP), at " +
267 ". At the time being, we are not supporting indels and other such variants."
270 vars[i] = *rec_data->d.allele[i];
273 return { vars, rec_data->n_allele };
281 std::pair<std::array<char, 6>,
size_t>
const& snp_chars,
291 throw std::runtime_error(
292 "Invalid VCF Record with FORMAT field 'AD' value < 0 for a sample, at " +
298 switch( snp_chars.first[i] ) {
329 throw std::runtime_error(
330 "Invalid VCF Record that contains a REF or ALT sequence/allele with "
331 "invalid nucleitide `" + std::string( 1, snp_chars.first[i] ) +
332 "` where only `[ACGTN*]` are allowed, at " +
349 auto throw_invalid_gt_size_ = [&](){
350 throw std::runtime_error(
351 "Invalid VCF Record with number of samples in the record (" +
353 ") not equal to the number of GT fields, at " +
359 size_t sample_idx = 0;
360 size_t missing_cnt = 0;
362 if( sample_idx >= variant.
samples.size() ) {
363 throw_invalid_gt_size_();
367 bool is_missing =
false;
368 for(
size_t i = 0; i < sample_gt.valid_value_count(); ++i ) {
369 auto const gt = sample_gt.get_value_at(i);
370 if( gt.is_missing() ) {
383 if( sample_idx != variant.
samples.size() ) {
384 throw_invalid_gt_size_();
388 if( missing_cnt == variant.
samples.size() ) {
397 throw std::runtime_error(
398 "Cannot convert VcfRecord to Variant, as the VcfRecord does not have "
399 "the required FORMAT field 'AD' for alleleic depth."
420 if( sample_ad.valid_value_count() > 0 && sample_ad.valid_value_count() != snp_chars.second ) {
421 throw std::runtime_error(
422 "Invalid VCF Record that contains " +
std::to_string( snp_chars.second ) +
423 " REF and ALT sequences/alleles, but its FORMAT field 'AD' only contains " +
424 std::to_string( sample_ad.valid_value_count() ) +
" entries, at " +
432 auto& sample = result.
samples.back();
438 throw std::runtime_error(
439 "Invalid VCF Record with number of samples in the record (" +
455 bool use_allelic_depth
461 if( use_allelic_depth ) {
474 throw std::runtime_error(
475 "Cannot convert VcfRecord to Variant, as the VcfRecord does not have "
476 "the required FORMAT field 'GT' for genotypes."
492 auto& sample = result.
samples.back();
495 size_t total_cnt = 0;
496 size_t missing_cnt = 0;
502 for(
size_t i = 0; i < sample_gt.valid_value_count(); ++i ) {
507 auto const gt = sample_gt.get_value_at(i);
508 auto const gt_int = gt.variant_index();
512 if( !( gt_int < 0 ||
static_cast<size_t>(gt_int) < snp_chars.second )) {
513 throw std::runtime_error(
514 "Invalid VCF Record that contains an index " +
std::to_string( gt_int ) +
515 " into the genotype list that does not exist, at " +
521 if( gt.is_missing() ) {
533 switch( snp_chars.first[gt_int] ) {
560 throw std::runtime_error(
561 "Invalid VCF Record that contains a REF or ALT sequence/allele with "
562 "invalid nucleitide `" + std::string( 1, snp_chars.first[i] ) +
563 "` where only `[ACGTN.]` are allowed, at " +
572 if( missing_cnt == total_cnt ) {
594 result.
add( it.record().get_chromosome(), it.record().get_position() );
615 auto insert_ = [&]( std::string
const& chr,
size_t beg,
size_t end ){
619 assert( beg > 0 && end > 0 );
620 assert( beg <= end );
624 target.
add( chr, beg, end,
true );
630 if( it.record().get_chromosome() == cur_chr ) {
640 auto const pos = it.record().get_position();
641 if( pos == cur_pos || pos == cur_pos + 1 ) {
645 assert( ! cur_chr.empty() );
646 insert_( cur_chr, beg_pos, cur_pos );
656 insert_( cur_chr, beg_pos, cur_pos );
659 cur_chr = it.record().get_chromosome();
660 beg_pos = it.record().get_position();
667 insert_( cur_chr, beg_pos, cur_pos );
677 for(
size_t i = 0; i < genotypes.size(); ++i ) {
678 auto const& g = genotypes[i];
681 result += ( g.is_phased() ?
"|" :
"/" );
683 result += ( g.is_missing() ?
"." :
std::to_string( g.variant_index() ));
691 for(
auto const& gt : genotypes ) {
692 result +=
static_cast<size_t>( gt.is_alternative() );
694 assert( result <= genotypes.size() );
704 return bcf_gt_allele( genotype_ );
709 return bcf_gt_allele( genotype_ ) == 0;
714 return bcf_gt_allele( genotype_ ) > 0;
719 return bcf_gt_is_missing( genotype_ );
724 return bcf_gt_is_phased( genotype_ );
735 #endif // htslib guard
bool is_reference() const
True iff the called variant of this genotype is the REF allele.
bool is_missing() const
True iff the variant call is missing for this genotype.
int32_t variant_index() const
Return the index of the variant set for this genotype call.
VcfHeader & header()
Return the VcfHeader instance associated with this record.
bool is_phased() const
True iff the called variant is phased.
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
size_type a_count
Count of all A nucleotides that are present in the sample.
std::string vcf_hl_type_to_string(int hl_type)
Internal helper function to convert htslib-internal BCF_HL_* header line type values to their string ...
int32_t data() const
Return the raw genotype value as used by htslib.
GenomeRegionList genome_region_list_from_vcf_file(std::string const &file)
Read a VCF file, and use its positions to create a GenomeRegionList.
List of positions/coordinates in a genome, for each chromosome.
VcfValueSpecial
Specification for special markers for the number of values expected for key-value-pairs of VCF/BCF fi...
@ kVariable
Variable number of possible values, or unknown, or unbounded. In VCF, this is denoted by '....
List of regions in a genome, for each chromosome.
Variant convert_to_variant_as_individuals(VcfRecord const &record, bool use_allelic_depth)
Convert a VcfRecord to a Variant, treating each sample as an individual, and combining them all into ...
void convert_to_variant_as_pool_set_missing_gt_(VcfRecord const &record, Variant &variant)
Local helper function that sets the filter status of a Variant and its samples to missing depending o...
GenomeLocusSet genome_locus_set_from_vcf_file(std::string const &file)
Read a VCF file, and use its positions to create a GenomeLocusSet.
std::string to_string(GenomeLocus const &locus)
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
size_type t_count
Count of all T nucleotides that are present in the sample.
::bcf1_t * data()
Return the internal htslib bcf1_t record data struct pointer.
std::pair< std::array< char, 6 >, size_t > get_vcf_record_snp_ref_alt_chars_(VcfRecord const &record)
Local helper function that returns the REF and ALT chars of a VcfRecord for SNPs.
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
size_type c_count
Count of all C nucleotides that are present in the sample.
size_t vcf_genotype_sum(std::vector< VcfGenotype > const &genotypes)
Return the sum of genotypes for a set of VcfGenotype entries, typically used to construct a genotype ...
@ kFixed
Fixed number of values expected. In VCF, this is denoted simply by an integer number.
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
std::string get_chromosome() const
Get the name of a chromosome/contig/sequence (CHROM, first column of the line).
void convert_to_variant_as_pool_tally_bases_(VcfRecord const &record, std::pair< std::array< char, 6 >, size_t > const &snp_chars, VcfFormatIteratorInt const &sample_ad, SampleCounts &sample)
Local helper function to tally up the bases form a VcfRecord into a SampleCounts.
genesis::utils::Range< VcfFormatIteratorGenotype > get_format_genotype() const
Get an iterator pair over the samples that accesses the FORMAT genotype (GT field/key/id) as a set of...
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
std::string vcf_value_special_to_string(VcfValueSpecial vl_type_num)
bool has_format(std::string const &id) const
Return whether the record has a given FORMAT id present.
size_t get_position() const
Get the position within the chromosome/contig (POS, second column of the line).
@ kMissing
Position is missing in the input data.
std::string vcf_genotype_string(std::vector< VcfGenotype > const &genotypes)
Return the VCF-like string representation of a set of VcfGenotype entries.
bool is_alternative() const
True iff the called variant of this genotype is not the REF, but one of the ALT alleles.
VcfValueType
Specification for the data type of the values expected in key-value-pairs of VCF/BCF files.
std::vector< SampleCounts > samples
Capture the information of a single SNP/variant line in a VCF/BCF file.
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
void add(std::string const &chromosome)
Add a whole chromosome to the list, so that all its positions are considered to be covered.
void unpack() const
Unpack the htslib bcf1_t record data.
genesis::utils::Range< VcfFormatIteratorInt > get_format_int(std::string const &id) const
Get an iterator pair over the samples that accesses a certain FORMAT id as an int value.
@ kMissing
Position is missing in the input data.
std::string vcf_value_type_to_string(VcfValueType ht_type)
Variant convert_to_variant_as_pool(VcfRecord const &record)
Convert a VcfRecord to a Variant, treating each sample column as a pool of individuals.
size_type g_count
Count of all G nucleotides that are present in the sample.